This is an analysis of employee data. We will:
An overview of the basic lay of the land of this data shows that:
#read in data
employees <- read.csv("CaseStudy2-data.csv")
#overview of the variables
str(employees)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
#how many NA values
colSums(is.na(employees))
## ID Age Attrition
## 0 0 0
## BusinessTravel DailyRate Department
## 0 0 0
## DistanceFromHome Education EducationField
## 0 0 0
## EmployeeCount EmployeeNumber EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked Over18 OverTime
## 0 0 0
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 0 0 0
## StandardHours StockOptionLevel TotalWorkingYears
## 0 0 0
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 0 0 0
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 0 0 0
#summaries of monthly income and attrition, the variables of interest for our models
summary(employees$MonthlyIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1081 2840 4946 6390 8182 19999
summary(employees$Attrition)
## No Yes
## 730 140
#useless variable with one value across the whole thing
employees$Over18 <- NULL
Because some ordinal variables going in as continuous right now, we will turn them into factors so that they will be treated as factors. I name the factor levels for the ones that I know their meaning.
#Job involvement
employees$JobInvolvement[which(employees$JobInvolvement == 1)] = 'Low'
employees$JobInvolvement[which(employees$JobInvolvement == 2)] = 'Medium'
employees$JobInvolvement[which(employees$JobInvolvement == 3)] = 'High'
employees$JobInvolvement[which(employees$JobInvolvement == 4)] = 'Very High'
employees$JobInvolvement = as.factor(employees$JobInvolvement)
summary(employees$JobInvolvement)
## High Low Medium Very High
## 514 47 228 81
#Job satisfaction
employees$JobSatisfaction[which(employees$JobSatisfaction == 1)] = 'Low'
employees$JobSatisfaction[which(employees$JobSatisfaction == 2)] = 'Medium'
employees$JobSatisfaction[which(employees$JobSatisfaction == 3)] = 'High'
employees$JobSatisfaction[which(employees$JobSatisfaction == 4)] = 'Very High'
employees$JobSatisfaction = as.factor(employees$JobSatisfaction)
summary(employees$JobSatisfaction)
## High Low Medium Very High
## 254 179 166 271
#Performance rating
employees$PerformanceRating[which(employees$PerformanceRating == 1)] = 'Low'
employees$PerformanceRating[which(employees$PerformanceRating == 2)] = 'Good'
employees$PerformanceRating[which(employees$PerformanceRating == 3)] = 'Excellent'
employees$PerformanceRating[which(employees$PerformanceRating == 4)] = 'Outstanding'
employees$PerformanceRating = as.factor(employees$PerformanceRating)
summary(employees$PerformanceRating)
## Excellent Outstanding
## 738 132
#Relationship status
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 1)] = 'Low'
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 2)] = 'Medium'
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 3)] = 'High'
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 4)] = 'Very High'
employees$RelationshipSatisfaction = as.factor(employees$RelationshipSatisfaction)
summary(employees$RelationshipSatisfaction)
## High Low Medium Very High
## 261 174 171 264
#Work life balance
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 1)] = 'Bad'
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 2)] = 'Good'
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 3)] = 'Better'
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 4)] = 'Best'
employees$WorkLifeBalance = as.factor(employees$WorkLifeBalance)
summary(employees$WorkLifeBalance)
## Bad Best Better Good
## 48 98 532 192
#Education
employees$Education[which(employees$Education == 1)] = 'Below College'
employees$Education[which(employees$Education == 2)] = 'College'
employees$Education[which(employees$Education == 3)] = 'Bachelor'
employees$Education[which(employees$Education == 4)] = 'Master'
employees$Education[which(employees$Education == 5)] = 'Doctor'
employees$Education = as.factor(employees$Education)
summary(employees$WorkLifeBalance)
## Bad Best Better Good
## 48 98 532 192
#Environment satisfaction
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 1)] = 'Low'
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 2)] = 'Medium'
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 3)] = 'High'
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 4)] = 'Very High'
employees$EnvironmentSatisfaction = as.factor(employees$EnvironmentSatisfaction)
summary(employees$EnvironmentSatisfaction)
## High Low Medium Very High
## 258 172 178 262
#Stock option level
employees$StockOptionLevel = factor(employees$StockOptionLevel)
summary(employees$StockOptionLevel)
## 0 1 2 3
## 379 355 81 55
According to this data, job satisfaction does not vary significantly from role to role. The mosaic plot and contingincy table show some variation, but the Chi Square test reveals that the variation in our dataset are not significant enough to conclude that different roles tend to have more or less job satisfaction than others (p value = .35)
library(ggmosaic)
ggplot(data = employees) +
geom_mosaic(aes(x = product(JobSatisfaction, JobRole), fill=JobRole), na.rm=TRUE) + labs(x = "Job Role", title='Job Satisfaction in Job Roles', y='Job Satisfaction')+
theme(axis.text.x = element_text(angle = 90))
#Contingency table
table(employees$JobSatisfaction, employees$JobRole)
##
## Healthcare Representative Human Resources Laboratory Technician
## High 23 8 43
## Low 16 5 32
## Medium 9 8 31
## Very High 28 6 47
##
## Manager Manufacturing Director Research Director Research Scientist
## High 12 29 16 48
## Low 12 12 13 32
## Medium 14 23 11 31
## Very High 13 23 11 61
##
## Sales Executive Sales Representative
## High 61 14
## Low 48 9
## Medium 25 14
## Very High 66 16
#Chi square says no significant difference in job satisfaction across education fields
chisq.test(table(employees$JobSatisfaction, employees$JobRole))
##
## Pearson's Chi-squared test
##
## data: table(employees$JobSatisfaction, employees$JobRole)
## X-squared = 26.048, df = 24, p-value = 0.3507
We can see that, as expected, people in more technical roles come from different backgrounds than people in less technical roles:
ggplot(data = employees) +
geom_mosaic(aes(x = product(EducationField, JobRole), fill=EducationField), na.rm=TRUE) + labs(y = "Education Field", title='Education Field of Job Roles', x='Job Role') +
theme(axis.text.x = element_text(angle = 90))
Clearly, the income of these job roles are very different from each other (p < 2e-16 from one-way ANOVA test). This will come in handy when we make our salary prediction model later.
ggplot(data = employees, aes(x=JobRole, y=MonthlyIncome, fill=JobRole)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("Incomes of Job Roles")
#ANOVA of income differences across job roles
summary(aov(MonthlyIncome ~ JobRole, data = employees))
## Df Sum Sq Mean Sq F value Pr(>F)
## JobRole 8 1.489e+10 1.861e+09 460.3 <2e-16 ***
## Residuals 861 3.481e+09 4.043e+06
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a difference across the job roles regarding how long people of these roles have been in the work force (p < 2e-16 from one-way ANOVA test). Sales represntative and HR seem to be the most entry-level type job, whereas manager and director have generally been working a longer time than those in other roles.
ggplot(data = employees, aes(x=JobRole, y=TotalWorkingYears, fill=JobRole)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("How Long People of Job Roles Have Worked")
#ANOVA of total working years compared across job roles
summary(aov(TotalWorkingYears ~ JobRole, data = employees))
## Df Sum Sq Mean Sq F value Pr(>F)
## JobRole 8 21632 2704.0 84.88 <2e-16 ***
## Residuals 861 27428 31.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
All the job roles are similar in their distribution of amount of job involvement, at least according to this data (p = .62 from Chi Square test). We don’t have insight as to what the definitions of these job involvement level break outs are, so it is hard to interpret this deeply.
ggplot(data = employees) +
geom_mosaic(aes(x = product(factor(JobInvolvement), JobRole), fill=JobInvolvement), na.rm=TRUE) + labs(y = "Job Involvement", title='Involvment of Job Roles', x='Job Role') +
theme(axis.text.x = element_text(angle = 90))
#Chi square says no significant difference in job satisfaction across education fields
chisq.test(table(employees$JobInvolvement, employees$JobRole))
## Warning in chisq.test(table(employees$JobInvolvement, employees$JobRole)): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(employees$JobInvolvement, employees$JobRole)
## X-squared = 21.286, df = 24, p-value = 0.6218
What continuous variables have a difference in distribution of attrition? It looks like from these paired scatterplots that attrition is pretty scattered across our continuous variables and none of them show very distinct clusters of attrition. Years at company and years in current role seem, if anything, to show mildly that people who have been at the company past a certain number of years are less likely to leave.
library(GGally)
#there's 19 continuous variables
employees %>%
select_if(is.numeric) %>%
dim()
## [1] 870 19
employees %>%
select_if(is.numeric) %>%
select(1:5) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
employees %>%
select_if(is.numeric) %>%
select(6:10) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
employees %>%
select_if(is.numeric) %>%
select(11:15) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
employees %>%
select_if(is.numeric) %>%
select(16:19) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
Taking a closer look at years in current role to compare attrition, we see that those who left the company stayed generally between 0 - 4 years, whereas those who did not leave tend to have been in the company between 2 - 7 years. A t-test shows the p-value for there being a difference in mean of those who left vs those who did not is p=1.522e-6, which is is strong evidence that there is a difference in means of those two populations.
t.test(YearsInCurrentRole ~ Attrition, data=employees,)
##
## Welch Two Sample t-test
##
## data: YearsInCurrentRole by Attrition
## t = 4.9513, df = 208, p-value = 1.522e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.9306052 2.1619584
## sample estimates:
## mean in group No mean in group Yes
## 4.453425 2.907143
employees %>%
ggplot(aes(x=Attrition, y=YearsInCurrentRole)) +
geom_boxplot() +
ggtitle("Years In Current Role by Attrition")
A check of the correlation matrix shows there is some multicollinearity between continuous variables, particularly the variables related to the number of years of being in a position or at a company.
library(corrplot)
## corrplot 0.84 loaded
## corrplot 0.84 loaded
M <- employees %>%
select_if(is.numeric) %>%
cor()
## Warning in cor(.): the standard deviation is zero
corrplot(M, method = "circle")
What categorical variables have a difference in distribution in response? Many of them do! Particularly noticeably, those who are likely to leave are:
#there's 14 categorical variables
employees %>%
select_if(is.factor) %>%
dim()
## [1] 870 16
categs <- names(select_if(employees, is.factor))
for(i in 1:length(categs)){
print(employees %>%
ggplot(aes(x = eval(parse(text=categs[i])), fill = Attrition)) +
geom_bar(position = "fill") +
xlab(categs[i]) +
ggtitle(paste("Proportion of Attrition by ", categs[i]))
)
}
Because there are a much higherp proportion of “No” than “Yes” in our response variable Attrition, I am setting up a balanced train test split to help our models train properly.
split_train_test <- function(df) {
# dataset with "no"
data_no = df[which(df$Attrition=="No"),]
# dataset with "yes"
data_yes = df[which(df$Attrition=="Yes"),]
#making more folds on No to balance the number with Yes
folds_no = createFolds(data_no$Attrition, k=8)
folds_yes = createFolds(data_yes$Attrition, k=2)
length(folds_no$Fold1)
length(folds_no$Fold2)
length(folds_yes$Fold1)
length(folds_yes$Fold2)
#Train
train_no = data_no[folds_no$Fold1,]
train_yes = data_yes[folds_yes$Fold1,]
train = rbind(train_no, train_yes)
#Test
test_no = data_no[c(folds_no$Fold2, folds_no$Fold3, folds_no$Fold4, folds_no$Fold5),]
test_yes = data_yes[folds_yes$Fold2,]
test = rbind(test_no, test_yes)
return(list(train, test))
}
The Naive Bayes algorithm works alright. The model trained on the balanced training set has Sensitivity 0.68 and Specificity 0.80 at a .5 probability threshold for categorizing “Yes”. The model trained on the unbalanced training set was worse; its ROC curve shows less area under the curve than the one trained on balanced data. So for the remainder of the classification models, we will just use the balanced training test split.
library(caret)
library(e1071)
library(ROCR)
library(plotROC)
#naive bayes on balanced training set
train1 <- split_train_test(employees)[[1]]
test1 <- split_train_test(employees)[[2]]
model.nb1 <- naiveBayes(Attrition ~ ., data=train1)
preds.nb1 <- predict(model.nb1, test1)
confusionMatrix(table(preds.nb1, test1$Attrition))
## Confusion Matrix and Statistics
##
##
## preds.nb1 No Yes
## No 250 16
## Yes 115 54
##
## Accuracy : 0.6989
## 95% CI : (0.6533, 0.7416)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2904
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6849
## Specificity : 0.7714
## Pos Pred Value : 0.9398
## Neg Pred Value : 0.3195
## Prevalence : 0.8391
## Detection Rate : 0.5747
## Detection Prevalence : 0.6115
## Balanced Accuracy : 0.7282
##
## 'Positive' Class : No
##
preds.nb1 <- predict(model.nb1, test1, type = "raw")
preds.nb1 <- prediction(preds.nb1[,2], test1$Attrition)
roc.perf_1 = performance(preds.nb1, measure = "tpr", x.measure = "fpr")
#naive bayes unbalanced training set
folds <- createFolds(employees$Attrition, k=2)
train2 <- employees[folds$Fold1,]
test2 <- employees[folds$Fold2,]
model.nb2 <- naiveBayes(Attrition ~ ., data=train2)
preds.nb2 <- predict(model.nb2, test2)
confusionMatrix(table(preds.nb2, test2$Attrition))
## Confusion Matrix and Statistics
##
##
## preds.nb2 No Yes
## No 303 25
## Yes 62 45
##
## Accuracy : 0.8
## 95% CI : (0.7593, 0.8366)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.9870652
##
## Kappa : 0.3897
##
## Mcnemar's Test P-Value : 0.0001136
##
## Sensitivity : 0.8301
## Specificity : 0.6429
## Pos Pred Value : 0.9238
## Neg Pred Value : 0.4206
## Prevalence : 0.8391
## Detection Rate : 0.6966
## Detection Prevalence : 0.7540
## Balanced Accuracy : 0.7365
##
## 'Positive' Class : No
##
preds.nb2 <- predict(model.nb2, test2, type = "raw")
preds.nb2 <- prediction(preds.nb2[,2], test2$Attrition)
roc.perf_2 = performance(preds.nb2, measure = "tpr", x.measure = "fpr")
#the balanced one definitely looks better and .6 seems a good threshold
plot(roc.perf_1, col="red", main = "ROCs of NB Model Trained on Balanced (red) and Unbalanced (blue)")
plot(roc.perf_2, add = TRUE, col="blue")
plot(roc.perf_1, colorize = TRUE, main="P Threshold Colorized ROC of NB trained on Balanced Data")
Another model for classification is KNN. KNN only takes numeric data and also is highly affected by the scale of the predictors, so I will make all the variables numeric and on a standardize scale for KNN.
#re-read in data
employees_num <- read.csv("CaseStudy2-data.csv")
#get rid of useless variables
employees_num$Over18 <- NULL
employees_num$ID <- NULL
#numericize categorical variables
employees_num$BusinessTravel = as.numeric(employees_num$BusinessTravel)
employees_num$Department = as.numeric(employees_num$Department)
employees_num$EducationField = as.numeric(employees_num$EducationField)
employees_num$Gender = as.numeric(employees_num$Gender)
employees_num$JobRole = as.numeric(employees_num$JobRole)
employees_num$MaritalStatus = as.numeric(employees_num$MaritalStatus)
employees_num$OverTime = as.numeric(employees_num$OverTime)
n <- dim(employees_num)[2]
#scale
employees_z <- employees_num %>%
mutate(zAge = scale(Age)) %>%
mutate(zBusinessTravel = scale(BusinessTravel)) %>%
mutate(zDailyRate = scale(DailyRate)) %>%
mutate(zDepartment = scale(Department)) %>%
mutate(zDistanceFromHome = scale(DistanceFromHome)) %>%
mutate(zEducation = scale(Education)) %>%
mutate(zEducationField = scale(EducationField)) %>%
mutate(zEmployeeCount = scale(EmployeeCount)) %>%
mutate(zEmployeeNumber = scale(EmployeeNumber)) %>%
mutate(zEnvironmentSatisfaction = scale(EnvironmentSatisfaction)) %>%
mutate(zGender = scale(Gender)) %>%
mutate(zHourlyRate = scale(HourlyRate)) %>%
mutate(zJobInvolvement = scale(JobInvolvement)) %>%
mutate(zJobLevel = scale(JobLevel)) %>%
mutate(zJobRole = scale(JobRole)) %>%
mutate(zJobSatisfaction = scale(JobSatisfaction)) %>%
mutate(zMaritalStatus = scale(MaritalStatus)) %>%
mutate(zMonthlyIncome = scale(MonthlyIncome)) %>%
mutate(zMonthlyRate = scale(MonthlyRate)) %>%
mutate(zNumCompaniesWorked = scale(NumCompaniesWorked)) %>%
mutate(zOverTime = scale(OverTime)) %>%
mutate(zPercentSalaryHike = scale(PercentSalaryHike)) %>%
mutate(zPerformanceRating = scale(PerformanceRating)) %>%
mutate(zRelationshipSatisfaction = scale(RelationshipSatisfaction)) %>%
mutate(zStandardHours = scale(StandardHours)) %>%
mutate(zStockOptionLevel = scale(StockOptionLevel)) %>%
mutate(zTotalWorkingYears = scale(TotalWorkingYears)) %>%
mutate(zTrainingTimesLastYear = scale(TrainingTimesLastYear)) %>%
mutate(zWorkLifeBalance = scale(WorkLifeBalance)) %>%
mutate(zYearsAtCompany = scale(YearsAtCompany)) %>%
mutate(zYearsInCurrentRole = scale(YearsInCurrentRole)) %>%
mutate(zYearsSinceLastPromotion = scale(YearsSinceLastPromotion)) %>%
mutate(zYearsWithCurrManager = scale(YearsWithCurrManager))
nz <- dim(employees_z)[2]
employees_z <- employees_z[,c((n+1):nz)]
employees_z$Attrition = employees$Attrition
#get rid of nans
colSums(is.na(employees_z))
##
## 0 0 0 0 0 0 0 870
##
## 0 0 0 0 0 0 0 0
##
## 0 0 0 0 0 0 0 0
##
## 870 0 0 0 0 0 0 0
## Attrition
## 0 0
employees_z$zStandardHours <- NULL
employees_z$zEmployeeCount <- NULL
str(employees_z)
## 'data.frame': 870 obs. of 32 variables:
## $ zAge : num [1:870, 1] -0.541 0.355 -0.205 -0.541 -1.437 ...
## ..- attr(*, "scaled:center")= num 36.8
## ..- attr(*, "scaled:scale")= num 8.93
## $ zBusinessTravel : num [1:870, 1] 0.589 0.589 -0.892 0.589 -0.892 ...
## ..- attr(*, "scaled:center")= num 2.6
## ..- attr(*, "scaled:scale")= num 0.675
## $ zDailyRate : num [1:870, 1] -1.7407 1.2285 -1.5338 -0.0355 -0.6188 ...
## ..- attr(*, "scaled:center")= num 815
## ..- attr(*, "scaled:scale")= num 401
## $ zDepartment : num [1:870, 1] 1.374 -0.517 -0.517 1.374 -0.517 ...
## ..- attr(*, "scaled:center")= num 2.27
## ..- attr(*, "scaled:scale")= num 0.529
## $ zDistanceFromHome : num [1:870, 1] 0.45 0.573 1.064 -1.025 -0.902 ...
## ..- attr(*, "scaled:center")= num 9.34
## ..- attr(*, "scaled:scale")= num 8.14
## $ zEducation : num [1:870, 1] 1.0741 0.0966 -0.8809 1.0741 -1.8583 ...
## ..- attr(*, "scaled:center")= num 2.9
## ..- attr(*, "scaled:scale")= num 1.02
## $ zEducationField : num [1:870, 1] -0.94 0.573 -0.94 -0.183 2.085 ...
## ..- attr(*, "scaled:center")= num 3.24
## ..- attr(*, "scaled:scale")= num 1.32
## $ zEmployeeNumber : num [1:870, 1] -0.282 0.162 0.632 1.631 1.019 ...
## ..- attr(*, "scaled:center")= num 1030
## ..- attr(*, "scaled:scale")= num 605
## $ zEnvironmentSatisfaction : num [1:870, 1] -0.638 0.272 0.272 0.272 -1.547 ...
## ..- attr(*, "scaled:center")= num 2.7
## ..- attr(*, "scaled:scale")= num 1.1
## $ zGender : num [1:870, 1] 0.828 0.828 0.828 -1.207 -1.207 ...
## ..- attr(*, "scaled:center")= num 1.59
## ..- attr(*, "scaled:scale")= num 0.492
## $ zHourlyRate : num [1:870, 1] 0.367 -1.074 -0.279 -0.875 -1.67 ...
## ..- attr(*, "scaled:center")= num 65.6
## ..- attr(*, "scaled:scale")= num 20.1
## $ zJobInvolvement : num [1:870, 1] 0.394 -1.028 0.394 0.394 0.394 ...
## ..- attr(*, "scaled:center")= num 2.72
## ..- attr(*, "scaled:scale")= num 0.704
## $ zJobLevel : num [1:870, 1] -0.0358 2.7162 0.8815 0.8815 -0.9532 ...
## ..- attr(*, "scaled:center")= num 2.04
## ..- attr(*, "scaled:scale")= num 1.09
## $ zJobRole : num [1:870, 1] 1.002 0.189 -0.217 1.002 0.596 ...
## ..- attr(*, "scaled:center")= num 5.53
## ..- attr(*, "scaled:scale")= num 2.46
## $ zJobSatisfaction : num [1:870, 1] 1.158 0.261 1.158 1.158 1.158 ...
## ..- attr(*, "scaled:center")= num 2.71
## ..- attr(*, "scaled:scale")= num 1.11
## $ zMaritalStatus : num [1:870, 1] -1.509 1.261 1.261 -0.124 1.261 ...
## ..- attr(*, "scaled:center")= num 2.09
## ..- attr(*, "scaled:scale")= num 0.722
## $ zMonthlyIncome : num [1:870, 1] -0.432 2.879 0.646 0.877 -0.572 ...
## ..- attr(*, "scaled:center")= num 6390
## ..- attr(*, "scaled:scale")= num 4598
## $ zMonthlyRate : num [1:870, 1] -0.714 0.453 0.79 1.365 0.407 ...
## ..- attr(*, "scaled:center")= num 14326
## ..- attr(*, "scaled:scale")= num 7108
## $ zNumCompaniesWorked : num [1:870, 1] -0.289 -0.685 -0.289 -0.685 -0.685 ...
## ..- attr(*, "scaled:center")= num 2.73
## ..- attr(*, "scaled:scale")= num 2.52
## $ zOverTime : num [1:870, 1] -0.638 -0.638 -0.638 -0.638 1.565 ...
## ..- attr(*, "scaled:center")= num 1.29
## ..- attr(*, "scaled:scale")= num 0.454
## $ zPercentSalaryHike : num [1:870, 1] -1.143 -0.326 -1.143 1.034 -0.599 ...
## ..- attr(*, "scaled:center")= num 15.2
## ..- attr(*, "scaled:scale")= num 3.68
## $ zPerformanceRating : num [1:870, 1] -0.423 -0.423 -0.423 -0.423 -0.423 ...
## ..- attr(*, "scaled:center")= num 3.15
## ..- attr(*, "scaled:scale")= num 0.359
## $ zRelationshipSatisfaction: num [1:870, 1] 0.266 -1.548 0.266 0.266 0.266 ...
## ..- attr(*, "scaled:center")= num 2.71
## ..- attr(*, "scaled:scale")= num 1.1
## $ zStockOptionLevel : num [1:870, 1] 0.252 -0.914 -0.914 1.418 -0.914 ...
## ..- attr(*, "scaled:center")= num 0.784
## ..- attr(*, "scaled:scale")= num 0.858
## $ zTotalWorkingYears : num [1:870, 1] -0.406 1.324 -0.14 0.392 -0.672 ...
## ..- attr(*, "scaled:center")= num 11.1
## ..- attr(*, "scaled:scale")= num 7.51
## $ zTrainingTimesLastYear : num [1:870, 1] 0.132 -0.654 -0.654 0.132 -0.654 ...
## ..- attr(*, "scaled:center")= num 2.83
## ..- attr(*, "scaled:scale")= num 1.27
## $ zWorkLifeBalance : num [1:870, 1] -1.098 1.711 0.307 0.307 0.307 ...
## ..- attr(*, "scaled:center")= num 2.78
## ..- attr(*, "scaled:scale")= num 0.712
## $ zYearsAtCompany : num [1:870, 1] -0.326 2.165 -0.824 1.169 -0.16 ...
## ..- attr(*, "scaled:center")= num 6.96
## ..- attr(*, "scaled:scale")= num 6.02
## $ zYearsInCurrentRole : num [1:870, 1] -0.606 0.768 -0.606 1.592 -0.331 ...
## ..- attr(*, "scaled:center")= num 4.2
## ..- attr(*, "scaled:scale")= num 3.64
## $ zYearsSinceLastPromotion : num [1:870, 1] -0.681 0.575 -0.053 0.889 -0.367 ...
## ..- attr(*, "scaled:center")= num 2.17
## ..- attr(*, "scaled:scale")= num 3.19
## $ zYearsWithCurrManager : num [1:870, 1] -0.319 1.36 -0.599 0.8 -0.319 ...
## ..- attr(*, "scaled:center")= num 4.14
## ..- attr(*, "scaled:scale")= num 3.57
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
The KNN model appears to work quite badly! The ROC is just about the same as a coin flip, and it seems no value of K can make the area under the curve any better, regardless of having scaled or not scaled the variables.
#KNN
##load the package class
library(class)
train <- split_train_test(employees_num)[[1]]
test <- split_train_test(employees_num)[[2]]
##run knn k=8 and make ROC--it looks really bad
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=8, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
plot(roc.perf_knn, colorize = TRUE,main = "ROC of KNN with k=8 on Original Scaled Data")
auc.knns <- c()
for(i in 1:80) {
#train test split
train <- split_train_test(employees_num)[[1]]
test <- split_train_test(employees_num)[[2]]
##get auc of knn
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=i, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
}
plot(x=1:80, y=auc.knns, xlab="k", main = "AUCs of KNN models with k=[1,80]")
#all the KNN models are bad
#second time, this time with scaled variables
train <- split_train_test(employees_z)[[1]]
test <- split_train_test(employees_z)[[2]]
##run knn k=8 and make ROC--it still looks very bad
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=as.factor(train$Attrition), k=8, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
plot(roc.perf_knn, colorize = TRUE, main = "ROC of KNN with k=8 on Stardard Scaled Data")
auc.knns <- c()
for(i in 1:80) {
#train test split
train <- split_train_test(employees_num)[[1]]
test <- split_train_test(employees_num)[[2]]
##get auc of knn for many ks
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=i, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
}
plot(x=1:80, y=auc.knns, xlab="k", main = "AUCs of KNN models with k=[1,80]")
#they're all still bad
Random Forest works well! At a probability threshold of .5 for classifying as “Yes”, the Random Forest model gets a .86 accuracy, .79 sensitivity, and .88 specificity. A comparison of its ROC curve verses the Naive Bayes model ROC curve shows that the Random Forest does better.
#RF
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
train <- split_train_test(employees)[[1]]
test <- split_train_test(employees)[[2]]
model.rf <- randomForest(Attrition ~ ., data = train, importance = TRUE)
preds.rf <- predict(model.rf, test)
confusionMatrix(table(preds.rf, test$Attrition), positive = "Yes")
## Confusion Matrix and Statistics
##
##
## preds.rf No Yes
## No 301 10
## Yes 64 60
##
## Accuracy : 0.8299
## 95% CI : (0.7912, 0.864)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.7247
##
## Kappa : 0.5198
##
## Mcnemar's Test P-Value : 7.223e-10
##
## Sensitivity : 0.8571
## Specificity : 0.8247
## Pos Pred Value : 0.4839
## Neg Pred Value : 0.9678
## Prevalence : 0.1609
## Detection Rate : 0.1379
## Detection Prevalence : 0.2851
## Balanced Accuracy : 0.8409
##
## 'Positive' Class : Yes
##
prob.rf <- predict(model.rf, test, type = "prob")
preds.rf <- prediction(prob.rf[,2], test$Attrition)
roc.rf = performance(preds.rf, measure = "tpr", x.measure = "fpr")
#the balanced one definitely looks better and .25 seems a good threshold
plot(roc.perf_1, col="red", main = "ROC of Naive Bayes (red) and Random Forest (green)")
plot(roc.rf, add = TRUE, col="green")
plot(roc.rf, colorize=TRUE, main = "Colorized P Threshold ROC Curve for RF Model")
#Feature Importances
featureimportances <- importance(model.rf)
sort(featureimportances[,2], decreasing=TRUE)
## StockOptionLevel OverTime TotalWorkingYears
## 8.14252644 6.26200491 5.54516115
## Age MaritalStatus EmployeeNumber
## 4.03970821 3.86992022 3.75336027
## JobRole MonthlyIncome YearsInCurrentRole
## 3.37024640 3.08013146 3.06185519
## YearsWithCurrManager JobLevel YearsAtCompany
## 2.57541325 1.86835137 1.84369957
## DistanceFromHome RelationshipSatisfaction DailyRate
## 1.73567288 1.34891868 1.26372878
## YearsSinceLastPromotion MonthlyRate ID
## 0.88595166 0.64309859 0.50604789
## PercentSalaryHike JobInvolvement PerformanceRating
## 0.46243177 0.25634005 0.23417923
## EmployeeCount StandardHours EnvironmentSatisfaction
## 0.00000000 0.00000000 -0.07411999
## WorkLifeBalance Education Gender
## -0.09198926 -0.20645814 -0.32441724
## BusinessTravel NumCompaniesWorked Department
## -0.47272666 -0.58633799 -1.06224347
## TrainingTimesLastYear JobSatisfaction HourlyRate
## -1.16799244 -1.23977576 -1.47557284
## EducationField
## -2.48717681
Random forest wins as a predictive model of attrition. From my analysis, the top influencing factors for attrition in a company appear to be:
First, we can see that income has a right skew. What continuous variables have a difference in distribution of the Monthly Income? From the scatterplot matrices, the most noticeable correlations with income are:
library(GGally)
employees %>%
ggplot(aes(x=MonthlyIncome)) +
geom_histogram() +
ggtitle("Histogram of Monthly Income")
employees %>%
select_if(is.numeric) %>%
select(1:5) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
employees %>%
select_if(is.numeric) %>%
select(6:10) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
employees %>%
select_if(is.numeric) %>%
select(11:15) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
employees %>%
select_if(is.numeric) %>%
select(16:19) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
What categorical variables have a difference in distribution in response? Most are not very telling. But as mentioned in the exploration analysis above, one notable categorical variable that shows major differences in income is:
#there's 16 categorical variables
employees %>%
select_if(is.factor) %>%
dim()
## [1] 870 16
categs <- names(select_if(employees, is.factor))
for(i in 1:length(categs)){
print(
employees %>%
ggplot(aes(x = eval(parse(text=categs[i])), y=MonthlyIncome)) +
geom_boxplot() +
xlab(categs[i])
)
}
To model income, we will first try to use linear regression. I try a number of linear regression models shown below–a manual model, a full model with everything, a LASSO selected model without interaction terms, a LASSO selected model with interaction terms, and a forward-selection model including interaction terms. The one with lowest RMSE is as selected by forward selection, and its formula is:
MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + DailyRate + Gender + ID + MonthlyRate + Department + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel
manually selected variables based on EDA plots
#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]
#fit the manual model
lm.manual <- lm(MonthlyIncome ~ Age + EmployeeNumber + NumCompaniesWorked +
TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + Attrition + BusinessTravel + Department + Education + EducationField + JobRole + StockOptionLevel, data=train)
#check residuals and summary
plot(lm.manual)
summary(lm.manual)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + EmployeeNumber + NumCompaniesWorked +
## TotalWorkingYears + YearsAtCompany + YearsInCurrentRole +
## YearsSinceLastPromotion + Attrition + BusinessTravel + Department +
## Education + EducationField + JobRole + StockOptionLevel,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4404.3 -945.9 -78.3 882.6 4578.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.804e+03 1.066e+03 5.443 9.14e-08 ***
## Age -1.654e+00 1.210e+01 -0.137 0.8913
## EmployeeNumber -7.374e-02 1.286e-01 -0.573 0.5669
## NumCompaniesWorked -1.522e+01 3.770e+01 -0.404 0.6867
## TotalWorkingYears 2.000e+02 2.068e+01 9.670 < 2e-16 ***
## YearsAtCompany 4.120e+01 2.564e+01 1.607 0.1089
## YearsInCurrentRole -3.901e+01 3.350e+01 -1.164 0.2449
## YearsSinceLastPromotion 4.175e+01 3.200e+01 1.305 0.1928
## AttritionYes 2.406e+02 2.276e+02 1.057 0.2910
## BusinessTravelTravel_Frequently 5.792e+02 2.861e+02 2.024 0.0436 *
## BusinessTravelTravel_Rarely 5.664e+02 2.461e+02 2.301 0.0219 *
## DepartmentResearch & Development -8.636e+02 9.045e+02 -0.955 0.3403
## DepartmentSales -1.151e+03 8.888e+02 -1.295 0.1961
## EducationBelow College 6.558e+01 2.666e+02 0.246 0.8059
## EducationCollege -1.548e+02 2.011e+02 -0.770 0.4419
## EducationDoctor 2.981e+02 5.742e+02 0.519 0.6040
## EducationMaster -2.152e+02 1.957e+02 -1.099 0.2723
## EducationFieldLife Sciences -6.163e+02 7.443e+02 -0.828 0.4082
## EducationFieldMarketing -6.262e+02 7.951e+02 -0.788 0.4314
## EducationFieldMedical -7.978e+02 7.531e+02 -1.059 0.2901
## EducationFieldOther -3.658e+02 8.119e+02 -0.450 0.6526
## EducationFieldTechnical Degree -6.608e+02 7.725e+02 -0.855 0.3928
## JobRoleHuman Resources -4.319e+03 1.004e+03 -4.303 2.12e-05 ***
## JobRoleLaboratory Technician -2.979e+03 3.162e+02 -9.420 < 2e-16 ***
## JobRoleManager 7.470e+03 5.649e+02 13.225 < 2e-16 ***
## JobRoleManufacturing Director 5.473e+02 3.594e+02 1.523 0.1286
## JobRoleResearch Director 7.057e+03 4.462e+02 15.816 < 2e-16 ***
## JobRoleResearch Scientist -2.970e+03 3.091e+02 -9.609 < 2e-16 ***
## JobRoleSales Executive 9.776e+01 7.335e+02 0.133 0.8940
## JobRoleSales Representative -2.576e+03 7.833e+02 -3.288 0.0011 **
## StockOptionLevel1 -1.376e+02 1.702e+02 -0.808 0.4194
## StockOptionLevel2 -9.421e+01 2.852e+02 -0.330 0.7413
## StockOptionLevel3 -9.926e+01 3.202e+02 -0.310 0.7567
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1535 on 402 degrees of freedom
## Multiple R-squared: 0.896, Adjusted R-squared: 0.8877
## F-statistic: 108.2 on 32 and 402 DF, p-value: < 2.2e-16
#prediction error
preds.lm.manual <- predict(lm.manual, test)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.manual, test$MonthlyIncome)
## [1] 1762.216
full model
#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]
#fit the model
lm.full <- lm(MonthlyIncome ~ ., data=train)
#check residuals and summary
plot(lm.full)
summary(lm.full)
##
## Call:
## lm(formula = MonthlyIncome ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3499.0 -650.9 -40.4 526.6 4256.7
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.838e+02 1.096e+03 -0.715 0.47495
## ID -2.243e-01 2.191e-01 -1.024 0.30669
## Age 5.775e+00 8.511e+00 0.679 0.49787
## AttritionYes 3.459e+02 1.740e+02 1.989 0.04746 *
## BusinessTravelTravel_Frequently 2.237e+02 2.143e+02 1.044 0.29738
## BusinessTravelTravel_Rarely 4.170e+02 1.818e+02 2.294 0.02235 *
## DailyRate 2.381e-01 1.367e-01 1.741 0.08243 .
## DepartmentResearch & Development -1.253e+02 7.803e+02 -0.161 0.87255
## DepartmentSales -3.808e+02 7.846e+02 -0.485 0.62771
## DistanceFromHome -7.500e+00 6.843e+00 -1.096 0.27378
## EducationBelow College 7.217e+01 1.872e+02 0.385 0.70011
## EducationCollege 3.990e+01 1.507e+02 0.265 0.79126
## EducationDoctor -4.667e+02 3.301e+02 -1.414 0.15821
## EducationMaster 1.320e+02 1.431e+02 0.922 0.35688
## EducationFieldLife Sciences 7.894e+01 5.224e+02 0.151 0.87997
## EducationFieldMarketing -7.654e+01 5.552e+02 -0.138 0.89044
## EducationFieldMedical -9.258e+00 5.203e+02 -0.018 0.98581
## EducationFieldOther 2.064e+02 5.583e+02 0.370 0.71177
## EducationFieldTechnical Degree 2.472e+01 5.488e+02 0.045 0.96410
## EmployeeCount NA NA NA NA
## EmployeeNumber 6.044e-02 9.072e-02 0.666 0.50567
## EnvironmentSatisfactionLow -1.669e+02 1.654e+02 -1.010 0.31337
## EnvironmentSatisfactionMedium -2.356e+02 1.600e+02 -1.472 0.14186
## EnvironmentSatisfactionVery High -3.958e+02 1.416e+02 -2.795 0.00547 **
## GenderMale 2.281e+01 1.120e+02 0.204 0.83878
## HourlyRate 1.378e+00 2.826e+00 0.488 0.62603
## JobInvolvementLow 4.133e+01 2.436e+02 0.170 0.86535
## JobInvolvementMedium -2.614e+02 1.294e+02 -2.020 0.04405 *
## JobInvolvementVery High 2.289e+02 1.935e+02 1.183 0.23741
## JobLevel 2.832e+03 1.284e+02 22.063 < 2e-16 ***
## JobRoleHuman Resources -6.887e+02 8.452e+02 -0.815 0.41568
## JobRoleLaboratory Technician -5.224e+02 2.628e+02 -1.988 0.04755 *
## JobRoleManager 4.151e+03 4.763e+02 8.715 < 2e-16 ***
## JobRoleManufacturing Director 1.121e+02 2.556e+02 0.438 0.66129
## JobRoleResearch Director 4.141e+03 3.385e+02 12.234 < 2e-16 ***
## JobRoleResearch Scientist -3.459e+02 2.613e+02 -1.324 0.18643
## JobRoleSales Executive 1.708e+02 5.868e+02 0.291 0.77124
## JobRoleSales Representative -1.949e+02 6.112e+02 -0.319 0.75000
## JobSatisfactionLow -3.991e+01 1.618e+02 -0.247 0.80526
## JobSatisfactionMedium 1.565e+01 1.667e+02 0.094 0.92528
## JobSatisfactionVery High 1.709e+02 1.457e+02 1.173 0.24162
## MaritalStatusMarried 3.447e+01 1.597e+02 0.216 0.82917
## MaritalStatusSingle -1.038e+02 2.418e+02 -0.429 0.66817
## MonthlyRate -1.148e-02 7.862e-03 -1.460 0.14509
## NumCompaniesWorked 2.612e+01 2.540e+01 1.028 0.30444
## OverTimeYes -7.022e+01 1.218e+02 -0.577 0.56452
## PercentSalaryHike 1.323e+01 2.413e+01 0.548 0.58385
## PerformanceRatingOutstanding -2.408e+02 2.480e+02 -0.971 0.33213
## RelationshipSatisfactionLow -1.485e+02 1.622e+02 -0.916 0.36032
## RelationshipSatisfactionMedium -2.255e+01 1.569e+02 -0.144 0.88577
## RelationshipSatisfactionVery High -4.325e+01 1.398e+02 -0.309 0.75715
## StandardHours NA NA NA NA
## StockOptionLevel1 6.359e+01 1.908e+02 0.333 0.73916
## StockOptionLevel2 2.731e+01 2.492e+02 0.110 0.91280
## StockOptionLevel3 -4.587e+01 2.652e+02 -0.173 0.86281
## TotalWorkingYears 3.123e+01 1.725e+01 1.811 0.07098 .
## TrainingTimesLastYear 3.497e+01 4.352e+01 0.804 0.42217
## WorkLifeBalanceBest 5.764e+01 2.847e+02 0.202 0.83968
## WorkLifeBalanceBetter 1.325e+02 2.396e+02 0.553 0.58066
## WorkLifeBalanceGood 2.502e+01 2.548e+02 0.098 0.92184
## YearsAtCompany 2.013e+00 2.332e+01 0.086 0.93128
## YearsInCurrentRole -1.683e+01 2.552e+01 -0.660 0.50989
## YearsSinceLastPromotion 3.665e+01 2.494e+01 1.469 0.14262
## YearsWithCurrManager -1.600e+00 3.022e+01 -0.053 0.95782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1071 on 374 degrees of freedom
## Multiple R-squared: 0.9503, Adjusted R-squared: 0.9422
## F-statistic: 117.2 on 61 and 374 DF, p-value: < 2.2e-16
#prediction error
preds.lm.full <- predict(lm.full, test)
## Warning in predict.lm(lm.full, test): prediction from a rank-deficient fit may
## be misleading
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.full, test$MonthlyIncome)
## [1] 1108.865
LASSO
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 3.0-2
#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]
#find the lasso variable choices
x_vars <- model.matrix(MonthlyIncome ~ (Age + EmployeeNumber + NumCompaniesWorked +
TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + Attrition + BusinessTravel + Department + Education + EducationField + JobRole + StockOptionLevel)^2 + . , employees)[,-1]
cvfit <- cv.glmnet(x_vars, employees$MonthlyIncome)
coef(cvfit, s = "lambda.1se")
## 511 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -503.790534
## Age .
## EmployeeNumber .
## NumCompaniesWorked .
## TotalWorkingYears 33.119295
## YearsAtCompany .
## YearsInCurrentRole .
## YearsSinceLastPromotion .
## AttritionYes .
## BusinessTravelTravel_Frequently .
## BusinessTravelTravel_Rarely .
## DepartmentResearch & Development .
## DepartmentSales .
## EducationBelow College .
## EducationCollege .
## EducationDoctor .
## EducationMaster .
## EducationFieldLife Sciences .
## EducationFieldMarketing .
## EducationFieldMedical .
## EducationFieldOther .
## EducationFieldTechnical Degree .
## JobRoleHuman Resources .
## JobRoleLaboratory Technician .
## JobRoleManager 2992.187743
## JobRoleManufacturing Director .
## JobRoleResearch Director 3094.684910
## JobRoleResearch Scientist .
## JobRoleSales Executive .
## JobRoleSales Representative .
## StockOptionLevel1 .
## StockOptionLevel2 .
## StockOptionLevel3 .
## ID .
## DailyRate .
## DistanceFromHome .
## EmployeeCount .
## EnvironmentSatisfactionLow .
## EnvironmentSatisfactionMedium .
## EnvironmentSatisfactionVery High .
## GenderMale .
## HourlyRate .
## JobInvolvementLow .
## JobInvolvementMedium .
## JobInvolvementVery High .
## JobLevel 3006.536914
## JobSatisfactionLow .
## JobSatisfactionMedium .
## JobSatisfactionVery High .
## MaritalStatusMarried .
## MaritalStatusSingle .
## MonthlyRate .
## OverTimeYes .
## PercentSalaryHike .
## PerformanceRatingOutstanding .
## RelationshipSatisfactionLow .
## RelationshipSatisfactionMedium .
## RelationshipSatisfactionVery High .
## StandardHours .
## TrainingTimesLastYear .
## WorkLifeBalanceBest .
## WorkLifeBalanceBetter .
## WorkLifeBalanceGood .
## YearsWithCurrManager .
## Age:EmployeeNumber .
## Age:NumCompaniesWorked .
## Age:TotalWorkingYears .
## Age:YearsAtCompany .
## Age:YearsInCurrentRole .
## Age:YearsSinceLastPromotion .
## Age:AttritionYes .
## Age:BusinessTravelTravel_Frequently .
## Age:BusinessTravelTravel_Rarely .
## Age:DepartmentResearch & Development .
## Age:DepartmentSales .
## Age:EducationBelow College .
## Age:EducationCollege .
## Age:EducationDoctor .
## Age:EducationMaster .
## Age:EducationFieldLife Sciences .
## Age:EducationFieldMarketing .
## Age:EducationFieldMedical .
## Age:EducationFieldOther .
## Age:EducationFieldTechnical Degree .
## Age:JobRoleHuman Resources .
## Age:JobRoleLaboratory Technician .
## Age:JobRoleManager .
## Age:JobRoleManufacturing Director .
## Age:JobRoleResearch Director .
## Age:JobRoleResearch Scientist .
## Age:JobRoleSales Executive .
## Age:JobRoleSales Representative .
## Age:StockOptionLevel1 .
## Age:StockOptionLevel2 .
## Age:StockOptionLevel3 .
## EmployeeNumber:NumCompaniesWorked .
## EmployeeNumber:TotalWorkingYears .
## EmployeeNumber:YearsAtCompany .
## EmployeeNumber:YearsInCurrentRole .
## EmployeeNumber:YearsSinceLastPromotion .
## EmployeeNumber:AttritionYes .
## EmployeeNumber:BusinessTravelTravel_Frequently .
## EmployeeNumber:BusinessTravelTravel_Rarely .
## EmployeeNumber:DepartmentResearch & Development .
## EmployeeNumber:DepartmentSales .
## EmployeeNumber:EducationBelow College .
## EmployeeNumber:EducationCollege .
## EmployeeNumber:EducationDoctor .
## EmployeeNumber:EducationMaster .
## EmployeeNumber:EducationFieldLife Sciences .
## EmployeeNumber:EducationFieldMarketing .
## EmployeeNumber:EducationFieldMedical .
## EmployeeNumber:EducationFieldOther .
## EmployeeNumber:EducationFieldTechnical Degree .
## EmployeeNumber:JobRoleHuman Resources .
## EmployeeNumber:JobRoleLaboratory Technician .
## EmployeeNumber:JobRoleManager .
## EmployeeNumber:JobRoleManufacturing Director .
## EmployeeNumber:JobRoleResearch Director .
## EmployeeNumber:JobRoleResearch Scientist .
## EmployeeNumber:JobRoleSales Executive .
## EmployeeNumber:JobRoleSales Representative .
## EmployeeNumber:StockOptionLevel1 .
## EmployeeNumber:StockOptionLevel2 .
## EmployeeNumber:StockOptionLevel3 .
## NumCompaniesWorked:TotalWorkingYears .
## NumCompaniesWorked:YearsAtCompany .
## NumCompaniesWorked:YearsInCurrentRole .
## NumCompaniesWorked:YearsSinceLastPromotion .
## NumCompaniesWorked:AttritionYes .
## NumCompaniesWorked:BusinessTravelTravel_Frequently .
## NumCompaniesWorked:BusinessTravelTravel_Rarely .
## NumCompaniesWorked:DepartmentResearch & Development .
## NumCompaniesWorked:DepartmentSales .
## NumCompaniesWorked:EducationBelow College .
## NumCompaniesWorked:EducationCollege .
## NumCompaniesWorked:EducationDoctor .
## NumCompaniesWorked:EducationMaster .
## NumCompaniesWorked:EducationFieldLife Sciences .
## NumCompaniesWorked:EducationFieldMarketing .
## NumCompaniesWorked:EducationFieldMedical .
## NumCompaniesWorked:EducationFieldOther .
## NumCompaniesWorked:EducationFieldTechnical Degree .
## NumCompaniesWorked:JobRoleHuman Resources .
## NumCompaniesWorked:JobRoleLaboratory Technician .
## NumCompaniesWorked:JobRoleManager .
## NumCompaniesWorked:JobRoleManufacturing Director .
## NumCompaniesWorked:JobRoleResearch Director .
## NumCompaniesWorked:JobRoleResearch Scientist .
## NumCompaniesWorked:JobRoleSales Executive .
## NumCompaniesWorked:JobRoleSales Representative .
## NumCompaniesWorked:StockOptionLevel1 .
## NumCompaniesWorked:StockOptionLevel2 .
## NumCompaniesWorked:StockOptionLevel3 .
## TotalWorkingYears:YearsAtCompany .
## TotalWorkingYears:YearsInCurrentRole .
## TotalWorkingYears:YearsSinceLastPromotion .
## TotalWorkingYears:AttritionYes .
## TotalWorkingYears:BusinessTravelTravel_Frequently .
## TotalWorkingYears:BusinessTravelTravel_Rarely 9.465865
## TotalWorkingYears:DepartmentResearch & Development .
## TotalWorkingYears:DepartmentSales .
## TotalWorkingYears:EducationBelow College .
## TotalWorkingYears:EducationCollege .
## TotalWorkingYears:EducationDoctor .
## TotalWorkingYears:EducationMaster .
## TotalWorkingYears:EducationFieldLife Sciences .
## TotalWorkingYears:EducationFieldMarketing .
## TotalWorkingYears:EducationFieldMedical .
## TotalWorkingYears:EducationFieldOther .
## TotalWorkingYears:EducationFieldTechnical Degree .
## TotalWorkingYears:JobRoleHuman Resources .
## TotalWorkingYears:JobRoleLaboratory Technician -28.574525
## TotalWorkingYears:JobRoleManager .
## TotalWorkingYears:JobRoleManufacturing Director .
## TotalWorkingYears:JobRoleResearch Director .
## TotalWorkingYears:JobRoleResearch Scientist .
## TotalWorkingYears:JobRoleSales Executive .
## TotalWorkingYears:JobRoleSales Representative .
## TotalWorkingYears:StockOptionLevel1 .
## TotalWorkingYears:StockOptionLevel2 .
## TotalWorkingYears:StockOptionLevel3 .
## YearsAtCompany:YearsInCurrentRole .
## YearsAtCompany:YearsSinceLastPromotion .
## YearsAtCompany:AttritionYes .
## YearsAtCompany:BusinessTravelTravel_Frequently .
## YearsAtCompany:BusinessTravelTravel_Rarely .
## YearsAtCompany:DepartmentResearch & Development .
## YearsAtCompany:DepartmentSales .
## YearsAtCompany:EducationBelow College .
## YearsAtCompany:EducationCollege .
## YearsAtCompany:EducationDoctor .
## YearsAtCompany:EducationMaster .
## YearsAtCompany:EducationFieldLife Sciences .
## YearsAtCompany:EducationFieldMarketing .
## YearsAtCompany:EducationFieldMedical .
## YearsAtCompany:EducationFieldOther .
## YearsAtCompany:EducationFieldTechnical Degree .
## YearsAtCompany:JobRoleHuman Resources .
## YearsAtCompany:JobRoleLaboratory Technician .
## YearsAtCompany:JobRoleManager 5.101132
## YearsAtCompany:JobRoleManufacturing Director .
## YearsAtCompany:JobRoleResearch Director .
## YearsAtCompany:JobRoleResearch Scientist .
## YearsAtCompany:JobRoleSales Executive .
## YearsAtCompany:JobRoleSales Representative .
## YearsAtCompany:StockOptionLevel1 .
## YearsAtCompany:StockOptionLevel2 .
## YearsAtCompany:StockOptionLevel3 .
## YearsInCurrentRole:YearsSinceLastPromotion .
## YearsInCurrentRole:AttritionYes .
## YearsInCurrentRole:BusinessTravelTravel_Frequently .
## YearsInCurrentRole:BusinessTravelTravel_Rarely .
## YearsInCurrentRole:DepartmentResearch & Development .
## YearsInCurrentRole:DepartmentSales .
## YearsInCurrentRole:EducationBelow College .
## YearsInCurrentRole:EducationCollege .
## YearsInCurrentRole:EducationDoctor .
## YearsInCurrentRole:EducationMaster .
## YearsInCurrentRole:EducationFieldLife Sciences .
## YearsInCurrentRole:EducationFieldMarketing .
## YearsInCurrentRole:EducationFieldMedical .
## YearsInCurrentRole:EducationFieldOther .
## YearsInCurrentRole:EducationFieldTechnical Degree .
## YearsInCurrentRole:JobRoleHuman Resources .
## YearsInCurrentRole:JobRoleLaboratory Technician .
## YearsInCurrentRole:JobRoleManager .
## YearsInCurrentRole:JobRoleManufacturing Director .
## YearsInCurrentRole:JobRoleResearch Director .
## YearsInCurrentRole:JobRoleResearch Scientist .
## YearsInCurrentRole:JobRoleSales Executive .
## YearsInCurrentRole:JobRoleSales Representative .
## YearsInCurrentRole:StockOptionLevel1 .
## YearsInCurrentRole:StockOptionLevel2 .
## YearsInCurrentRole:StockOptionLevel3 .
## YearsSinceLastPromotion:AttritionYes .
## YearsSinceLastPromotion:BusinessTravelTravel_Frequently .
## YearsSinceLastPromotion:BusinessTravelTravel_Rarely .
## YearsSinceLastPromotion:DepartmentResearch & Development .
## YearsSinceLastPromotion:DepartmentSales .
## YearsSinceLastPromotion:EducationBelow College .
## YearsSinceLastPromotion:EducationCollege .
## YearsSinceLastPromotion:EducationDoctor .
## YearsSinceLastPromotion:EducationMaster .
## YearsSinceLastPromotion:EducationFieldLife Sciences .
## YearsSinceLastPromotion:EducationFieldMarketing .
## YearsSinceLastPromotion:EducationFieldMedical .
## YearsSinceLastPromotion:EducationFieldOther .
## YearsSinceLastPromotion:EducationFieldTechnical Degree .
## YearsSinceLastPromotion:JobRoleHuman Resources .
## YearsSinceLastPromotion:JobRoleLaboratory Technician .
## YearsSinceLastPromotion:JobRoleManager .
## YearsSinceLastPromotion:JobRoleManufacturing Director .
## YearsSinceLastPromotion:JobRoleResearch Director .
## YearsSinceLastPromotion:JobRoleResearch Scientist .
## YearsSinceLastPromotion:JobRoleSales Executive .
## YearsSinceLastPromotion:JobRoleSales Representative .
## YearsSinceLastPromotion:StockOptionLevel1 .
## YearsSinceLastPromotion:StockOptionLevel2 .
## YearsSinceLastPromotion:StockOptionLevel3 .
## AttritionYes:BusinessTravelTravel_Frequently .
## AttritionYes:BusinessTravelTravel_Rarely .
## AttritionYes:DepartmentResearch & Development .
## AttritionYes:DepartmentSales .
## AttritionYes:EducationBelow College .
## AttritionYes:EducationCollege .
## AttritionYes:EducationDoctor .
## AttritionYes:EducationMaster .
## AttritionYes:EducationFieldLife Sciences .
## AttritionYes:EducationFieldMarketing .
## AttritionYes:EducationFieldMedical .
## AttritionYes:EducationFieldOther .
## AttritionYes:EducationFieldTechnical Degree .
## AttritionYes:JobRoleHuman Resources .
## AttritionYes:JobRoleLaboratory Technician .
## AttritionYes:JobRoleManager .
## AttritionYes:JobRoleManufacturing Director .
## AttritionYes:JobRoleResearch Director .
## AttritionYes:JobRoleResearch Scientist .
## AttritionYes:JobRoleSales Executive .
## AttritionYes:JobRoleSales Representative .
## AttritionYes:StockOptionLevel1 .
## AttritionYes:StockOptionLevel2 .
## AttritionYes:StockOptionLevel3 .
## BusinessTravelTravel_Frequently:DepartmentResearch & Development .
## BusinessTravelTravel_Rarely:DepartmentResearch & Development .
## BusinessTravelTravel_Frequently:DepartmentSales .
## BusinessTravelTravel_Rarely:DepartmentSales .
## BusinessTravelTravel_Frequently:EducationBelow College .
## BusinessTravelTravel_Rarely:EducationBelow College .
## BusinessTravelTravel_Frequently:EducationCollege .
## BusinessTravelTravel_Rarely:EducationCollege .
## BusinessTravelTravel_Frequently:EducationDoctor .
## BusinessTravelTravel_Rarely:EducationDoctor .
## BusinessTravelTravel_Frequently:EducationMaster .
## BusinessTravelTravel_Rarely:EducationMaster .
## BusinessTravelTravel_Frequently:EducationFieldLife Sciences .
## BusinessTravelTravel_Rarely:EducationFieldLife Sciences .
## BusinessTravelTravel_Frequently:EducationFieldMarketing .
## BusinessTravelTravel_Rarely:EducationFieldMarketing .
## BusinessTravelTravel_Frequently:EducationFieldMedical .
## BusinessTravelTravel_Rarely:EducationFieldMedical .
## BusinessTravelTravel_Frequently:EducationFieldOther .
## BusinessTravelTravel_Rarely:EducationFieldOther .
## BusinessTravelTravel_Frequently:EducationFieldTechnical Degree .
## BusinessTravelTravel_Rarely:EducationFieldTechnical Degree .
## BusinessTravelTravel_Frequently:JobRoleHuman Resources .
## BusinessTravelTravel_Rarely:JobRoleHuman Resources .
## BusinessTravelTravel_Frequently:JobRoleLaboratory Technician .
## BusinessTravelTravel_Rarely:JobRoleLaboratory Technician .
## BusinessTravelTravel_Frequently:JobRoleManager .
## BusinessTravelTravel_Rarely:JobRoleManager .
## BusinessTravelTravel_Frequently:JobRoleManufacturing Director .
## BusinessTravelTravel_Rarely:JobRoleManufacturing Director .
## BusinessTravelTravel_Frequently:JobRoleResearch Director .
## BusinessTravelTravel_Rarely:JobRoleResearch Director .
## BusinessTravelTravel_Frequently:JobRoleResearch Scientist .
## BusinessTravelTravel_Rarely:JobRoleResearch Scientist .
## BusinessTravelTravel_Frequently:JobRoleSales Executive .
## BusinessTravelTravel_Rarely:JobRoleSales Executive .
## BusinessTravelTravel_Frequently:JobRoleSales Representative .
## BusinessTravelTravel_Rarely:JobRoleSales Representative .
## BusinessTravelTravel_Frequently:StockOptionLevel1 .
## BusinessTravelTravel_Rarely:StockOptionLevel1 .
## BusinessTravelTravel_Frequently:StockOptionLevel2 .
## BusinessTravelTravel_Rarely:StockOptionLevel2 .
## BusinessTravelTravel_Frequently:StockOptionLevel3 .
## BusinessTravelTravel_Rarely:StockOptionLevel3 .
## DepartmentResearch & Development:EducationBelow College .
## DepartmentSales:EducationBelow College .
## DepartmentResearch & Development:EducationCollege .
## DepartmentSales:EducationCollege .
## DepartmentResearch & Development:EducationDoctor .
## DepartmentSales:EducationDoctor .
## DepartmentResearch & Development:EducationMaster .
## DepartmentSales:EducationMaster .
## DepartmentResearch & Development:EducationFieldLife Sciences .
## DepartmentSales:EducationFieldLife Sciences .
## DepartmentResearch & Development:EducationFieldMarketing .
## DepartmentSales:EducationFieldMarketing .
## DepartmentResearch & Development:EducationFieldMedical .
## DepartmentSales:EducationFieldMedical .
## DepartmentResearch & Development:EducationFieldOther .
## DepartmentSales:EducationFieldOther .
## DepartmentResearch & Development:EducationFieldTechnical Degree .
## DepartmentSales:EducationFieldTechnical Degree .
## DepartmentResearch & Development:JobRoleHuman Resources .
## DepartmentSales:JobRoleHuman Resources .
## DepartmentResearch & Development:JobRoleLaboratory Technician .
## DepartmentSales:JobRoleLaboratory Technician .
## DepartmentResearch & Development:JobRoleManager .
## DepartmentSales:JobRoleManager .
## DepartmentResearch & Development:JobRoleManufacturing Director .
## DepartmentSales:JobRoleManufacturing Director .
## DepartmentResearch & Development:JobRoleResearch Director 17.924932
## DepartmentSales:JobRoleResearch Director .
## DepartmentResearch & Development:JobRoleResearch Scientist .
## DepartmentSales:JobRoleResearch Scientist .
## DepartmentResearch & Development:JobRoleSales Executive .
## DepartmentSales:JobRoleSales Executive .
## DepartmentResearch & Development:JobRoleSales Representative .
## DepartmentSales:JobRoleSales Representative .
## DepartmentResearch & Development:StockOptionLevel1 .
## DepartmentSales:StockOptionLevel1 .
## DepartmentResearch & Development:StockOptionLevel2 .
## DepartmentSales:StockOptionLevel2 .
## DepartmentResearch & Development:StockOptionLevel3 .
## DepartmentSales:StockOptionLevel3 .
## EducationBelow College:EducationFieldLife Sciences .
## EducationCollege:EducationFieldLife Sciences .
## EducationDoctor:EducationFieldLife Sciences .
## EducationMaster:EducationFieldLife Sciences .
## EducationBelow College:EducationFieldMarketing .
## EducationCollege:EducationFieldMarketing .
## EducationDoctor:EducationFieldMarketing .
## EducationMaster:EducationFieldMarketing .
## EducationBelow College:EducationFieldMedical .
## EducationCollege:EducationFieldMedical .
## EducationDoctor:EducationFieldMedical .
## EducationMaster:EducationFieldMedical .
## EducationBelow College:EducationFieldOther .
## EducationCollege:EducationFieldOther .
## EducationDoctor:EducationFieldOther .
## EducationMaster:EducationFieldOther .
## EducationBelow College:EducationFieldTechnical Degree .
## EducationCollege:EducationFieldTechnical Degree .
## EducationDoctor:EducationFieldTechnical Degree .
## EducationMaster:EducationFieldTechnical Degree .
## EducationBelow College:JobRoleHuman Resources .
## EducationCollege:JobRoleHuman Resources .
## EducationDoctor:JobRoleHuman Resources .
## EducationMaster:JobRoleHuman Resources .
## EducationBelow College:JobRoleLaboratory Technician .
## EducationCollege:JobRoleLaboratory Technician .
## EducationDoctor:JobRoleLaboratory Technician .
## EducationMaster:JobRoleLaboratory Technician .
## EducationBelow College:JobRoleManager .
## EducationCollege:JobRoleManager .
## EducationDoctor:JobRoleManager .
## EducationMaster:JobRoleManager .
## EducationBelow College:JobRoleManufacturing Director .
## EducationCollege:JobRoleManufacturing Director .
## EducationDoctor:JobRoleManufacturing Director .
## EducationMaster:JobRoleManufacturing Director .
## EducationBelow College:JobRoleResearch Director .
## EducationCollege:JobRoleResearch Director .
## EducationDoctor:JobRoleResearch Director .
## EducationMaster:JobRoleResearch Director .
## EducationBelow College:JobRoleResearch Scientist .
## EducationCollege:JobRoleResearch Scientist .
## EducationDoctor:JobRoleResearch Scientist .
## EducationMaster:JobRoleResearch Scientist .
## EducationBelow College:JobRoleSales Executive .
## EducationCollege:JobRoleSales Executive .
## EducationDoctor:JobRoleSales Executive .
## EducationMaster:JobRoleSales Executive .
## EducationBelow College:JobRoleSales Representative .
## EducationCollege:JobRoleSales Representative .
## EducationDoctor:JobRoleSales Representative .
## EducationMaster:JobRoleSales Representative .
## EducationBelow College:StockOptionLevel1 .
## EducationCollege:StockOptionLevel1 .
## EducationDoctor:StockOptionLevel1 .
## EducationMaster:StockOptionLevel1 .
## EducationBelow College:StockOptionLevel2 .
## EducationCollege:StockOptionLevel2 .
## EducationDoctor:StockOptionLevel2 .
## EducationMaster:StockOptionLevel2 .
## EducationBelow College:StockOptionLevel3 .
## EducationCollege:StockOptionLevel3 .
## EducationDoctor:StockOptionLevel3 .
## EducationMaster:StockOptionLevel3 .
## EducationFieldLife Sciences:JobRoleHuman Resources .
## EducationFieldMarketing:JobRoleHuman Resources .
## EducationFieldMedical:JobRoleHuman Resources .
## EducationFieldOther:JobRoleHuman Resources .
## EducationFieldTechnical Degree:JobRoleHuman Resources .
## EducationFieldLife Sciences:JobRoleLaboratory Technician .
## EducationFieldMarketing:JobRoleLaboratory Technician .
## EducationFieldMedical:JobRoleLaboratory Technician .
## EducationFieldOther:JobRoleLaboratory Technician .
## EducationFieldTechnical Degree:JobRoleLaboratory Technician .
## EducationFieldLife Sciences:JobRoleManager .
## EducationFieldMarketing:JobRoleManager .
## EducationFieldMedical:JobRoleManager .
## EducationFieldOther:JobRoleManager .
## EducationFieldTechnical Degree:JobRoleManager .
## EducationFieldLife Sciences:JobRoleManufacturing Director .
## EducationFieldMarketing:JobRoleManufacturing Director .
## EducationFieldMedical:JobRoleManufacturing Director .
## EducationFieldOther:JobRoleManufacturing Director .
## EducationFieldTechnical Degree:JobRoleManufacturing Director .
## EducationFieldLife Sciences:JobRoleResearch Director .
## EducationFieldMarketing:JobRoleResearch Director .
## EducationFieldMedical:JobRoleResearch Director .
## EducationFieldOther:JobRoleResearch Director .
## EducationFieldTechnical Degree:JobRoleResearch Director .
## EducationFieldLife Sciences:JobRoleResearch Scientist .
## EducationFieldMarketing:JobRoleResearch Scientist .
## EducationFieldMedical:JobRoleResearch Scientist .
## EducationFieldOther:JobRoleResearch Scientist .
## EducationFieldTechnical Degree:JobRoleResearch Scientist .
## EducationFieldLife Sciences:JobRoleSales Executive .
## EducationFieldMarketing:JobRoleSales Executive .
## EducationFieldMedical:JobRoleSales Executive .
## EducationFieldOther:JobRoleSales Executive .
## EducationFieldTechnical Degree:JobRoleSales Executive .
## EducationFieldLife Sciences:JobRoleSales Representative .
## EducationFieldMarketing:JobRoleSales Representative .
## EducationFieldMedical:JobRoleSales Representative .
## EducationFieldOther:JobRoleSales Representative .
## EducationFieldTechnical Degree:JobRoleSales Representative .
## EducationFieldLife Sciences:StockOptionLevel1 .
## EducationFieldMarketing:StockOptionLevel1 .
## EducationFieldMedical:StockOptionLevel1 .
## EducationFieldOther:StockOptionLevel1 .
## EducationFieldTechnical Degree:StockOptionLevel1 .
## EducationFieldLife Sciences:StockOptionLevel2 .
## EducationFieldMarketing:StockOptionLevel2 .
## EducationFieldMedical:StockOptionLevel2 .
## EducationFieldOther:StockOptionLevel2 .
## EducationFieldTechnical Degree:StockOptionLevel2 .
## EducationFieldLife Sciences:StockOptionLevel3 .
## EducationFieldMarketing:StockOptionLevel3 .
## EducationFieldMedical:StockOptionLevel3 .
## EducationFieldOther:StockOptionLevel3 .
## EducationFieldTechnical Degree:StockOptionLevel3 .
## JobRoleHuman Resources:StockOptionLevel1 .
## JobRoleLaboratory Technician:StockOptionLevel1 .
## JobRoleManager:StockOptionLevel1 .
## JobRoleManufacturing Director:StockOptionLevel1 .
## JobRoleResearch Director:StockOptionLevel1 .
## JobRoleResearch Scientist:StockOptionLevel1 .
## JobRoleSales Executive:StockOptionLevel1 .
## JobRoleSales Representative:StockOptionLevel1 .
## JobRoleHuman Resources:StockOptionLevel2 .
## JobRoleLaboratory Technician:StockOptionLevel2 .
## JobRoleManager:StockOptionLevel2 .
## JobRoleManufacturing Director:StockOptionLevel2 .
## JobRoleResearch Director:StockOptionLevel2 .
## JobRoleResearch Scientist:StockOptionLevel2 .
## JobRoleSales Executive:StockOptionLevel2 .
## JobRoleSales Representative:StockOptionLevel2 .
## JobRoleHuman Resources:StockOptionLevel3 .
## JobRoleLaboratory Technician:StockOptionLevel3 .
## JobRoleManager:StockOptionLevel3 .
## JobRoleManufacturing Director:StockOptionLevel3 .
## JobRoleResearch Director:StockOptionLevel3 .
## JobRoleResearch Scientist:StockOptionLevel3 .
## JobRoleSales Executive:StockOptionLevel3 .
## JobRoleSales Representative:StockOptionLevel3 .
#fit the model
lm.lasso <- lm(MonthlyIncome ~ Attrition+BusinessTravel+ DistanceFromHome + Education + EmployeeNumber + EnvironmentSatisfaction + Gender + JobLevel + JobRole + MonthlyRate + RelationshipSatisfaction+ TotalWorkingYears + YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager, data=train)
lm.lasso2 <- lm(MonthlyIncome ~ TotalWorkingYears+ JobRole+JobLevel + TotalWorkingYears:BusinessTravel + TotalWorkingYears:JobRole + YearsAtCompany:JobRole + Department:JobRole, data=train)
#check residuals and summary
plot(lm.lasso)
summary(lm.lasso)
##
## Call:
## lm(formula = MonthlyIncome ~ Attrition + BusinessTravel + DistanceFromHome +
## Education + EmployeeNumber + EnvironmentSatisfaction + Gender +
## JobLevel + JobRole + MonthlyRate + RelationshipSatisfaction +
## TotalWorkingYears + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3773.6 -629.7 5.4 668.4 3666.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.070e+01 3.818e+02 0.211 0.832704
## AttritionYes 1.227e+02 1.454e+02 0.843 0.399488
## BusinessTravelTravel_Frequently 2.120e+02 2.051e+02 1.034 0.301807
## BusinessTravelTravel_Rarely 4.585e+02 1.775e+02 2.583 0.010158 *
## DistanceFromHome -4.701e+00 6.535e+00 -0.719 0.472356
## EducationBelow College 2.413e+02 1.757e+02 1.374 0.170288
## EducationCollege -1.555e+02 1.456e+02 -1.068 0.286220
## EducationDoctor -5.355e+02 3.027e+02 -1.769 0.077660 .
## EducationMaster 7.089e+01 1.300e+02 0.546 0.585691
## EmployeeNumber 3.675e-04 8.626e-02 0.004 0.996603
## EnvironmentSatisfactionLow -2.235e+02 1.524e+02 -1.467 0.143206
## EnvironmentSatisfactionMedium -1.829e+01 1.468e+02 -0.125 0.900894
## EnvironmentSatisfactionVery High -3.345e+01 1.371e+02 -0.244 0.807332
## GenderMale 6.623e+01 1.051e+02 0.630 0.529111
## JobLevel 2.706e+03 1.150e+02 23.526 < 2e-16 ***
## JobRoleHuman Resources -8.127e+02 4.179e+02 -1.945 0.052525 .
## JobRoleLaboratory Technician -1.048e+03 2.394e+02 -4.375 1.55e-05 ***
## JobRoleManager 3.788e+03 3.119e+02 12.145 < 2e-16 ***
## JobRoleManufacturing Director 7.162e+01 2.456e+02 0.292 0.770704
## JobRoleResearch Director 4.075e+03 3.094e+02 13.169 < 2e-16 ***
## JobRoleResearch Scientist -6.777e+02 2.441e+02 -2.776 0.005763 **
## JobRoleSales Executive -4.536e+02 2.096e+02 -2.164 0.031054 *
## JobRoleSales Representative -1.006e+03 3.341e+02 -3.011 0.002771 **
## MonthlyRate -2.455e-03 7.469e-03 -0.329 0.742538
## RelationshipSatisfactionLow 9.558e+00 1.505e+02 0.063 0.949400
## RelationshipSatisfactionMedium 1.257e+02 1.498e+02 0.839 0.401803
## RelationshipSatisfactionVery High -1.218e+01 1.329e+02 -0.092 0.927018
## TotalWorkingYears 4.643e+01 1.225e+01 3.790 0.000174 ***
## YearsInCurrentRole 2.307e+01 2.097e+01 1.100 0.271834
## YearsSinceLastPromotion 1.928e+01 1.993e+01 0.968 0.333856
## YearsWithCurrManager -2.682e+01 1.987e+01 -1.350 0.177801
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1045 on 403 degrees of freedom
## Multiple R-squared: 0.9547, Adjusted R-squared: 0.9513
## F-statistic: 282.8 on 30 and 403 DF, p-value: < 2.2e-16
plot(lm.lasso2)
summary(lm.lasso2)
##
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobRole + JobLevel +
## TotalWorkingYears:BusinessTravel + TotalWorkingYears:JobRole +
## YearsAtCompany:JobRole + Department:JobRole, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2907.2 -599.2 -52.9 637.4 4082.9
##
## Coefficients: (16 not defined because of singularities)
## Estimate
## (Intercept) 455.661
## TotalWorkingYears 18.120
## JobRoleHuman Resources -438.939
## JobRoleLaboratory Technician -225.807
## JobRoleManager 4036.065
## JobRoleManufacturing Director -758.388
## JobRoleResearch Director 4331.253
## JobRoleResearch Scientist -446.010
## JobRoleSales Executive -1188.930
## JobRoleSales Representative -670.855
## JobLevel 2621.974
## TotalWorkingYears:BusinessTravelTravel_Frequently 11.880
## TotalWorkingYears:BusinessTravelTravel_Rarely 35.471
## TotalWorkingYears:JobRoleHuman Resources 122.214
## TotalWorkingYears:JobRoleLaboratory Technician -103.507
## TotalWorkingYears:JobRoleManager 1.700
## TotalWorkingYears:JobRoleManufacturing Director 86.018
## TotalWorkingYears:JobRoleResearch Director -14.702
## TotalWorkingYears:JobRoleResearch Scientist 14.905
## TotalWorkingYears:JobRoleSales Executive 94.084
## TotalWorkingYears:JobRoleSales Representative -16.991
## JobRoleHealthcare Representative:YearsAtCompany 21.713
## JobRoleHuman Resources:YearsAtCompany -208.595
## JobRoleLaboratory Technician:YearsAtCompany 19.057
## JobRoleManager:YearsAtCompany 35.804
## JobRoleManufacturing Director:YearsAtCompany -7.398
## JobRoleResearch Director:YearsAtCompany 23.671
## JobRoleResearch Scientist:YearsAtCompany -55.573
## JobRoleSales Executive:YearsAtCompany -11.063
## JobRoleSales Representative:YearsAtCompany -39.249
## JobRoleHealthcare Representative:DepartmentResearch & Development NA
## JobRoleHuman Resources:DepartmentResearch & Development NA
## JobRoleLaboratory Technician:DepartmentResearch & Development NA
## JobRoleManager:DepartmentResearch & Development -217.754
## JobRoleManufacturing Director:DepartmentResearch & Development NA
## JobRoleResearch Director:DepartmentResearch & Development NA
## JobRoleResearch Scientist:DepartmentResearch & Development NA
## JobRoleSales Executive:DepartmentResearch & Development NA
## JobRoleSales Representative:DepartmentResearch & Development NA
## JobRoleHealthcare Representative:DepartmentSales NA
## JobRoleHuman Resources:DepartmentSales NA
## JobRoleLaboratory Technician:DepartmentSales NA
## JobRoleManager:DepartmentSales -813.402
## JobRoleManufacturing Director:DepartmentSales NA
## JobRoleResearch Director:DepartmentSales NA
## JobRoleResearch Scientist:DepartmentSales NA
## JobRoleSales Executive:DepartmentSales NA
## JobRoleSales Representative:DepartmentSales NA
## Std. Error
## (Intercept) 410.873
## TotalWorkingYears 32.813
## JobRoleHuman Resources 847.161
## JobRoleLaboratory Technician 430.290
## JobRoleManager 1353.665
## JobRoleManufacturing Director 525.482
## JobRoleResearch Director 819.602
## JobRoleResearch Scientist 430.688
## JobRoleSales Executive 435.775
## JobRoleSales Representative 507.983
## JobLevel 112.669
## TotalWorkingYears:BusinessTravelTravel_Frequently 16.155
## TotalWorkingYears:BusinessTravelTravel_Rarely 14.236
## TotalWorkingYears:JobRoleHuman Resources 178.859
## TotalWorkingYears:JobRoleLaboratory Technician 43.558
## TotalWorkingYears:JobRoleManager 48.983
## TotalWorkingYears:JobRoleManufacturing Director 41.826
## TotalWorkingYears:JobRoleResearch Director 43.845
## TotalWorkingYears:JobRoleResearch Scientist 40.707
## TotalWorkingYears:JobRoleSales Executive 41.263
## TotalWorkingYears:JobRoleSales Representative 62.389
## JobRoleHealthcare Representative:YearsAtCompany 27.869
## JobRoleHuman Resources:YearsAtCompany 183.278
## JobRoleLaboratory Technician:YearsAtCompany 33.032
## JobRoleManager:YearsAtCompany 22.023
## JobRoleManufacturing Director:YearsAtCompany 37.870
## JobRoleResearch Director:YearsAtCompany 26.571
## JobRoleResearch Scientist:YearsAtCompany 32.143
## JobRoleSales Executive:YearsAtCompany 24.057
## JobRoleSales Representative:YearsAtCompany 107.211
## JobRoleHealthcare Representative:DepartmentResearch & Development NA
## JobRoleHuman Resources:DepartmentResearch & Development NA
## JobRoleLaboratory Technician:DepartmentResearch & Development NA
## JobRoleManager:DepartmentResearch & Development 788.342
## JobRoleManufacturing Director:DepartmentResearch & Development NA
## JobRoleResearch Director:DepartmentResearch & Development NA
## JobRoleResearch Scientist:DepartmentResearch & Development NA
## JobRoleSales Executive:DepartmentResearch & Development NA
## JobRoleSales Representative:DepartmentResearch & Development NA
## JobRoleHealthcare Representative:DepartmentSales NA
## JobRoleHuman Resources:DepartmentSales NA
## JobRoleLaboratory Technician:DepartmentSales NA
## JobRoleManager:DepartmentSales 754.045
## JobRoleManufacturing Director:DepartmentSales NA
## JobRoleResearch Director:DepartmentSales NA
## JobRoleResearch Scientist:DepartmentSales NA
## JobRoleSales Executive:DepartmentSales NA
## JobRoleSales Representative:DepartmentSales NA
## t value
## (Intercept) 1.109
## TotalWorkingYears 0.552
## JobRoleHuman Resources -0.518
## JobRoleLaboratory Technician -0.525
## JobRoleManager 2.982
## JobRoleManufacturing Director -1.443
## JobRoleResearch Director 5.285
## JobRoleResearch Scientist -1.036
## JobRoleSales Executive -2.728
## JobRoleSales Representative -1.321
## JobLevel 23.271
## TotalWorkingYears:BusinessTravelTravel_Frequently 0.735
## TotalWorkingYears:BusinessTravelTravel_Rarely 2.492
## TotalWorkingYears:JobRoleHuman Resources 0.683
## TotalWorkingYears:JobRoleLaboratory Technician -2.376
## TotalWorkingYears:JobRoleManager 0.035
## TotalWorkingYears:JobRoleManufacturing Director 2.057
## TotalWorkingYears:JobRoleResearch Director -0.335
## TotalWorkingYears:JobRoleResearch Scientist 0.366
## TotalWorkingYears:JobRoleSales Executive 2.280
## TotalWorkingYears:JobRoleSales Representative -0.272
## JobRoleHealthcare Representative:YearsAtCompany 0.779
## JobRoleHuman Resources:YearsAtCompany -1.138
## JobRoleLaboratory Technician:YearsAtCompany 0.577
## JobRoleManager:YearsAtCompany 1.626
## JobRoleManufacturing Director:YearsAtCompany -0.195
## JobRoleResearch Director:YearsAtCompany 0.891
## JobRoleResearch Scientist:YearsAtCompany -1.729
## JobRoleSales Executive:YearsAtCompany -0.460
## JobRoleSales Representative:YearsAtCompany -0.366
## JobRoleHealthcare Representative:DepartmentResearch & Development NA
## JobRoleHuman Resources:DepartmentResearch & Development NA
## JobRoleLaboratory Technician:DepartmentResearch & Development NA
## JobRoleManager:DepartmentResearch & Development -0.276
## JobRoleManufacturing Director:DepartmentResearch & Development NA
## JobRoleResearch Director:DepartmentResearch & Development NA
## JobRoleResearch Scientist:DepartmentResearch & Development NA
## JobRoleSales Executive:DepartmentResearch & Development NA
## JobRoleSales Representative:DepartmentResearch & Development NA
## JobRoleHealthcare Representative:DepartmentSales NA
## JobRoleHuman Resources:DepartmentSales NA
## JobRoleLaboratory Technician:DepartmentSales NA
## JobRoleManager:DepartmentSales -1.079
## JobRoleManufacturing Director:DepartmentSales NA
## JobRoleResearch Director:DepartmentSales NA
## JobRoleResearch Scientist:DepartmentSales NA
## JobRoleSales Executive:DepartmentSales NA
## JobRoleSales Representative:DepartmentSales NA
## Pr(>|t|)
## (Intercept) 0.26809
## TotalWorkingYears 0.58111
## JobRoleHuman Resources 0.60465
## JobRoleLaboratory Technician 0.60003
## JobRoleManager 0.00304 **
## JobRoleManufacturing Director 0.14974
## JobRoleResearch Director 2.07e-07 ***
## JobRoleResearch Scientist 0.30102
## JobRoleSales Executive 0.00665 **
## JobRoleSales Representative 0.18738
## JobLevel < 2e-16 ***
## TotalWorkingYears:BusinessTravelTravel_Frequently 0.46253
## TotalWorkingYears:BusinessTravelTravel_Rarely 0.01312 *
## TotalWorkingYears:JobRoleHuman Resources 0.49481
## TotalWorkingYears:JobRoleLaboratory Technician 0.01796 *
## TotalWorkingYears:JobRoleManager 0.97234
## TotalWorkingYears:JobRoleManufacturing Director 0.04037 *
## TotalWorkingYears:JobRoleResearch Director 0.73755
## TotalWorkingYears:JobRoleResearch Scientist 0.71443
## TotalWorkingYears:JobRoleSales Executive 0.02312 *
## TotalWorkingYears:JobRoleSales Representative 0.78550
## JobRoleHealthcare Representative:YearsAtCompany 0.43637
## JobRoleHuman Resources:YearsAtCompany 0.25574
## JobRoleLaboratory Technician:YearsAtCompany 0.56431
## JobRoleManager:YearsAtCompany 0.10479
## JobRoleManufacturing Director:YearsAtCompany 0.84521
## JobRoleResearch Director:YearsAtCompany 0.37354
## JobRoleResearch Scientist:YearsAtCompany 0.08459 .
## JobRoleSales Executive:YearsAtCompany 0.64586
## JobRoleSales Representative:YearsAtCompany 0.71449
## JobRoleHealthcare Representative:DepartmentResearch & Development NA
## JobRoleHuman Resources:DepartmentResearch & Development NA
## JobRoleLaboratory Technician:DepartmentResearch & Development NA
## JobRoleManager:DepartmentResearch & Development 0.78252
## JobRoleManufacturing Director:DepartmentResearch & Development NA
## JobRoleResearch Director:DepartmentResearch & Development NA
## JobRoleResearch Scientist:DepartmentResearch & Development NA
## JobRoleSales Executive:DepartmentResearch & Development NA
## JobRoleSales Representative:DepartmentResearch & Development NA
## JobRoleHealthcare Representative:DepartmentSales NA
## JobRoleHuman Resources:DepartmentSales NA
## JobRoleLaboratory Technician:DepartmentSales NA
## JobRoleManager:DepartmentSales 0.28136
## JobRoleManufacturing Director:DepartmentSales NA
## JobRoleResearch Director:DepartmentSales NA
## JobRoleResearch Scientist:DepartmentSales NA
## JobRoleSales Executive:DepartmentSales NA
## JobRoleSales Representative:DepartmentSales NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1000 on 402 degrees of freedom
## Multiple R-squared: 0.9585, Adjusted R-squared: 0.9553
## F-statistic: 299.8 on 31 and 402 DF, p-value: < 2.2e-16
#prediction error
preds.lm.lasso <- predict(lm.lasso, test)
print("RMSE lasso 1:")
## [1] "RMSE lasso 1:"
RMSE(preds.lm.lasso, test$MonthlyIncome)
## [1] 1098.978
preds.lm.lasso2 <- predict(lm.lasso2, test)
## Warning in predict.lm(lm.lasso2, test): prediction from a rank-deficient fit may
## be misleading
print("RMSE lasso 2:")
## [1] "RMSE lasso 2:"
RMSE(preds.lm.lasso2, test$MonthlyIncome)
## [1] 1078.853
Forward step model choosing from all variables plus interactions of manually selected variables
#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]
#find the forward selection variables
model.null<-lm(MonthlyIncome ~ 1, data=employees)
model.complex <- lm(MonthlyIncome ~ (Age + EmployeeNumber + NumCompaniesWorked +
TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + Attrition + BusinessTravel + Department + Education + EducationField + JobRole + StockOptionLevel)^2 + ., data=train)
model.forward <- step(model.null,
scope = list(upper=model.complex),
direction="forward",
data=employees)
## Start: AIC=14674.96
## MonthlyIncome ~ 1
##
## Df Sum of Sq RSS AIC
## + JobLevel 1 1.6636e+10 1.7337e+09 12623
## + JobRole 8 1.4888e+10 3.4814e+09 13244
## + TotalWorkingYears 1 1.1133e+10 7.2362e+09 13866
## + YearsAtCompany 1 4.4354e+09 1.3934e+10 14436
## + Age 1 4.3083e+09 1.4061e+10 14444
## + YearsInCurrentRole 1 2.4051e+09 1.5965e+10 14555
## + YearsWithCurrManager 1 1.9822e+09 1.6387e+10 14578
## + YearsSinceLastPromotion 1 1.8333e+09 1.6536e+10 14586
## + NumCompaniesWorked 1 4.4644e+08 1.7923e+10 14656
## + Attrition 1 4.4085e+08 1.7929e+10 14656
## + Education 4 3.5197e+08 1.8018e+10 14666
## + MaritalStatus 2 1.9261e+08 1.8177e+10 14670
## + MonthlyRate 1 7.6645e+07 1.8293e+10 14673
## + StockOptionLevel 3 1.5399e+08 1.8216e+10 14674
## + Gender 1 5.5886e+07 1.8314e+10 14674
## + PercentSalaryHike 1 5.3300e+07 1.8316e+10 14674
## + ID 1 4.8984e+07 1.8321e+10 14675
## <none> 1.8370e+10 14675
## + PerformanceRating 1 3.4173e+07 1.8335e+10 14675
## + Department 2 7.5183e+07 1.8294e+10 14675
## + BusinessTravel 2 7.3567e+07 1.8296e+10 14676
## + TrainingTimesLastYear 1 2.7961e+07 1.8342e+10 14676
## + EmployeeNumber 1 2.1490e+07 1.8348e+10 14676
## + OverTime 1 1.1729e+07 1.8358e+10 14676
## + DistanceFromHome 1 8.1655e+05 1.8369e+10 14677
## + HourlyRate 1 1.0503e+05 1.8370e+10 14677
## + DailyRate 1 1.4200e+02 1.8370e+10 14677
## + JobSatisfaction 3 5.6010e+07 1.8314e+10 14678
## + EducationField 5 1.1661e+08 1.8253e+10 14679
## + WorkLifeBalance 3 3.1221e+07 1.8338e+10 14680
## + EnvironmentSatisfaction 3 3.0267e+07 1.8339e+10 14680
## + RelationshipSatisfaction 3 2.0585e+07 1.8349e+10 14680
## + JobInvolvement 3 5.1176e+06 1.8365e+10 14681
##
## Step: AIC=12623.4
## MonthlyIncome ~ JobLevel
##
## Df Sum of Sq RSS AIC
## + JobRole 8 722387200 1011360314 12170
## + TotalWorkingYears 1 59348516 1674398998 12595
## + Department 2 53738569 1680008945 12600
## + Age 1 18766260 1714981254 12616
## + BusinessTravel 2 22678748 1711068767 12616
## + DistanceFromHome 1 13740838 1720006676 12618
## + NumCompaniesWorked 1 8964138 1724783376 12621
## + YearsWithCurrManager 1 8790603 1724956912 12621
## + EducationField 5 23514517 1710232998 12622
## <none> 1733747514 12623
## + ID 1 3750153 1729997361 12624
## + MaritalStatus 2 6608834 1727138681 12624
## + YearsInCurrentRole 1 2410053 1731337461 12624
## + Gender 1 2066377 1731681137 12624
## + HourlyRate 1 1863245 1731884269 12624
## + JobInvolvement 3 9325826 1724421688 12625
## + EmployeeNumber 1 1152961 1732594553 12625
## + PerformanceRating 1 731501 1733016014 12625
## + MonthlyRate 1 723684 1733023830 12625
## + PercentSalaryHike 1 535748 1733211766 12625
## + YearsAtCompany 1 409075 1733338439 12625
## + TrainingTimesLastYear 1 394917 1733352598 12625
## + DailyRate 1 50672 1733696843 12625
## + YearsSinceLastPromotion 1 43699 1733703815 12625
## + OverTime 1 29064 1733718450 12625
## + Attrition 1 7238 1733740276 12625
## + RelationshipSatisfaction 3 7455506 1726292008 12626
## + EnvironmentSatisfaction 3 6529811 1727217704 12626
## + StockOptionLevel 3 4430571 1729316944 12627
## + WorkLifeBalance 3 2755349 1730992165 12628
## + JobSatisfaction 3 1406896 1732340618 12629
## + Education 4 5345894 1728401620 12629
##
## Step: AIC=12170.48
## MonthlyIncome ~ JobLevel + JobRole
##
## Df Sum of Sq RSS AIC
## + TotalWorkingYears 1 41160686 970199629 12136
## + BusinessTravel 2 14967502 996392812 12162
## + YearsSinceLastPromotion 1 9817643 1001542671 12164
## + Age 1 9789787 1001570528 12164
## + YearsAtCompany 1 4037001 1007323313 12169
## + NumCompaniesWorked 1 4036366 1007323948 12169
## + ID 1 3714160 1007646154 12169
## + DistanceFromHome 1 3615890 1007744424 12169
## + MonthlyRate 1 3392275 1007968040 12170
## + DailyRate 1 3029272 1008331042 12170
## + Education 4 9867535 1001492780 12170
## + Gender 1 2661752 1008698562 12170
## + YearsInCurrentRole 1 2566679 1008793635 12170
## + PerformanceRating 1 2497230 1008863084 12170
## <none> 1011360314 12170
## + EmployeeNumber 1 2146125 1009214189 12171
## + TrainingTimesLastYear 1 350625 1011009690 12172
## + Attrition 1 159796 1011200518 12172
## + PercentSalaryHike 1 84128 1011276187 12172
## + HourlyRate 1 64570 1011295744 12172
## + YearsWithCurrManager 1 38404 1011321911 12172
## + OverTime 1 950 1011359365 12172
## + Department 2 1652354 1009707960 12173
## + MaritalStatus 2 1504761 1009855553 12173
## + WorkLifeBalance 3 3656169 1007704146 12173
## + StockOptionLevel 3 2837486 1008522828 12174
## + RelationshipSatisfaction 3 2310928 1009049386 12174
## + JobSatisfaction 3 1851173 1009509141 12175
## + JobInvolvement 3 1831830 1009528484 12175
## + EnvironmentSatisfaction 3 274454 1011085860 12176
## + EducationField 5 1448401 1009911913 12179
##
## Step: AIC=12136.33
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears
##
## Df Sum of Sq RSS AIC
## + TotalWorkingYears:JobRole 8 69573387 900626241 12088
## + BusinessTravel 2 14325268 955874361 12127
## + YearsWithCurrManager 1 3661817 966537812 12135
## + Gender 1 3545977 966653651 12135
## + MonthlyRate 1 3453163 966746466 12135
## + ID 1 3270835 966928793 12135
## + DailyRate 1 3163711 967035918 12136
## + DistanceFromHome 1 3136452 967063177 12136
## <none> 970199629 12136
## + PerformanceRating 1 2189208 968010421 12136
## + EmployeeNumber 1 1996410 968203219 12136
## + YearsSinceLastPromotion 1 1318418 968881210 12137
## + Education 4 7804863 962394766 12137
## + Attrition 1 644137 969555492 12138
## + YearsAtCompany 1 586481 969613147 12138
## + Department 2 2647259 967552370 12138
## + TrainingTimesLastYear 1 395815 969803813 12138
## + NumCompaniesWorked 1 288899 969910730 12138
## + YearsInCurrentRole 1 281184 969918445 12138
## + OverTime 1 26815 970172814 12138
## + PercentSalaryHike 1 14208 970185420 12138
## + HourlyRate 1 10665 970188964 12138
## + Age 1 4644 970194985 12138
## + WorkLifeBalance 3 3373073 966826556 12139
## + MaritalStatus 2 950845 969248783 12140
## + RelationshipSatisfaction 3 2520254 967679375 12140
## + StockOptionLevel 3 1686890 968512738 12141
## + JobSatisfaction 3 1512963 968686666 12141
## + JobInvolvement 3 1096298 969103331 12141
## + EnvironmentSatisfaction 3 85820 970113809 12142
## + EducationField 5 1829621 968370008 12145
##
## Step: AIC=12087.59
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + JobRole:TotalWorkingYears
##
## Df Sum of Sq RSS AIC
## + BusinessTravel 2 15130983 885495259 12077
## + DailyRate 1 4259385 896366857 12086
## + Gender 1 3421421 897204820 12086
## + ID 1 3362793 897263448 12086
## + DistanceFromHome 1 2724073 897902169 12087
## + MonthlyRate 1 2551486 898074755 12087
## <none> 900626241 12088
## + PerformanceRating 1 1962232 898664009 12088
## + EmployeeNumber 1 1377351 899248891 12088
## + NumCompaniesWorked 1 933019 899693223 12089
## + Department 2 2953976 897672265 12089
## + YearsWithCurrManager 1 714405 899911836 12089
## + TrainingTimesLastYear 1 423706 900202535 12089
## + YearsAtCompany 1 187385 900438857 12089
## + YearsSinceLastPromotion 1 180031 900446210 12089
## + YearsInCurrentRole 1 147890 900478352 12090
## + HourlyRate 1 53797 900572444 12090
## + PercentSalaryHike 1 37841 900588400 12090
## + Age 1 14316 900611925 12090
## + Attrition 1 1835 900624406 12090
## + OverTime 1 216 900626026 12090
## + Education 4 5587649 895038592 12090
## + MaritalStatus 2 940889 899685352 12091
## + RelationshipSatisfaction 3 2180819 898445422 12092
## + JobSatisfaction 3 2081901 898544341 12092
## + WorkLifeBalance 3 1987383 898638858 12092
## + JobInvolvement 3 1756180 898870061 12092
## + StockOptionLevel 3 947934 899678308 12093
## + EnvironmentSatisfaction 3 876975 899749266 12093
## + EducationField 5 2191845 898434397 12096
##
## Step: AIC=12076.85
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## JobRole:TotalWorkingYears
##
## Df Sum of Sq RSS AIC
## + DailyRate 1 4685745 880809514 12074
## + Gender 1 3617564 881877694 12075
## + ID 1 3099132 882396126 12076
## + TotalWorkingYears:BusinessTravel 2 4861257 880634002 12076
## + MonthlyRate 1 2640304 882854954 12076
## <none> 885495259 12077
## + DistanceFromHome 1 1909965 883585294 12077
## + PerformanceRating 1 1813918 883681341 12077
## + Department 2 3725302 881769957 12077
## + EmployeeNumber 1 1277817 884217442 12078
## + NumCompaniesWorked 1 1195368 884299891 12078
## + YearsWithCurrManager 1 460265 885034993 12078
## + YearsSinceLastPromotion 1 386696 885108563 12078
## + TrainingTimesLastYear 1 368327 885126932 12078
## + YearsAtCompany 1 134176 885361082 12079
## + YearsInCurrentRole 1 132213 885363046 12079
## + HourlyRate 1 74757 885420501 12079
## + Age 1 60913 885434345 12079
## + OverTime 1 24933 885470326 12079
## + PercentSalaryHike 1 17907 885477352 12079
## + Attrition 1 6338 885488921 12079
## + Education 4 5473016 880022243 12080
## + MaritalStatus 2 525792 884969467 12080
## + RelationshipSatisfaction 3 2087177 883408082 12081
## + JobSatisfaction 3 1833142 883662117 12081
## + WorkLifeBalance 3 1610533 883884726 12081
## + JobInvolvement 3 1498776 883996483 12081
## + StockOptionLevel 3 1053946 884441312 12082
## + EnvironmentSatisfaction 3 789969 884705289 12082
## + EducationField 5 2005697 883489562 12085
## + BusinessTravel:JobRole 16 4474286 881020972 12104
##
## Step: AIC=12074.24
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## DailyRate + JobRole:TotalWorkingYears
##
## Df Sum of Sq RSS AIC
## + Gender 1 3717903 877091611 12073
## + TotalWorkingYears:BusinessTravel 2 4953275 875856239 12073
## + ID 1 2850825 877958689 12073
## + MonthlyRate 1 2489631 878319883 12074
## <none> 880809514 12074
## + DistanceFromHome 1 1937404 878872110 12074
## + Department 2 3892746 876916767 12074
## + PerformanceRating 1 1690986 879118528 12075
## + EmployeeNumber 1 1491301 879318213 12075
## + NumCompaniesWorked 1 999804 879809710 12075
## + YearsSinceLastPromotion 1 547055 880262459 12076
## + TrainingTimesLastYear 1 397499 880412015 12076
## + YearsWithCurrManager 1 334068 880475446 12076
## + YearsInCurrentRole 1 161981 880647533 12076
## + HourlyRate 1 151536 880657978 12076
## + YearsAtCompany 1 77052 880732462 12076
## + Age 1 42012 880767502 12076
## + PercentSalaryHike 1 38379 880771135 12076
## + OverTime 1 29157 880780357 12076
## + Attrition 1 1842 880807672 12076
## + Education 4 5645632 875163882 12077
## + MaritalStatus 2 453020 880356494 12078
## + RelationshipSatisfaction 3 1984894 878824620 12078
## + JobSatisfaction 3 1887121 878922393 12078
## + WorkLifeBalance 3 1669308 879140206 12079
## + JobInvolvement 3 1349136 879460378 12079
## + StockOptionLevel 3 948685 879860829 12079
## + EnvironmentSatisfaction 3 724954 880084560 12080
## + EducationField 5 1985368 878824146 12082
## + BusinessTravel:JobRole 16 4305914 876503600 12102
##
## Step: AIC=12072.56
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## DailyRate + Gender + JobRole:TotalWorkingYears
##
## Df Sum of Sq RSS AIC
## + TotalWorkingYears:BusinessTravel 2 4863805 872227806 12072
## + ID 1 2783995 874307616 12072
## + MonthlyRate 1 2362447 874729164 12072
## <none> 877091611 12073
## + DistanceFromHome 1 1860842 875230769 12073
## + Department 2 3739550 873352061 12073
## + PerformanceRating 1 1605947 875485664 12073
## + EmployeeNumber 1 1526961 875564650 12073
## + NumCompaniesWorked 1 966921 876124689 12074
## + YearsSinceLastPromotion 1 537724 876553887 12074
## + TrainingTimesLastYear 1 406328 876685283 12074
## + YearsInCurrentRole 1 251176 876840435 12074
## + YearsWithCurrManager 1 231171 876860440 12074
## + HourlyRate 1 182167 876909444 12074
## + YearsAtCompany 1 43746 877047865 12074
## + Age 1 40204 877051407 12074
## + PercentSalaryHike 1 39316 877052295 12074
## + OverTime 1 34861 877056750 12074
## + Attrition 1 5515 877086095 12075
## + Education 4 5536801 871554810 12075
## + MaritalStatus 2 653056 876438555 12076
## + WorkLifeBalance 3 1799081 875292530 12077
## + JobSatisfaction 3 1663257 875428354 12077
## + RelationshipSatisfaction 3 1612439 875479172 12077
## + JobInvolvement 3 1244140 875847471 12077
## + StockOptionLevel 3 982402 876109209 12078
## + EnvironmentSatisfaction 3 769376 876322235 12078
## + EducationField 5 1844361 875247250 12081
## + BusinessTravel:JobRole 16 4139017 872952594 12100
##
## Step: AIC=12071.72
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## DailyRate + Gender + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel
##
## Df Sum of Sq RSS AIC
## + ID 1 2348104 869879702 12071
## + MonthlyRate 1 2269073 869958733 12072
## + Department 2 4118126 868109680 12072
## <none> 872227806 12072
## + EmployeeNumber 1 1618510 870609296 12072
## + PerformanceRating 1 1499677 870728129 12072
## + DistanceFromHome 1 1429645 870798161 12072
## + NumCompaniesWorked 1 935399 871292407 12073
## + YearsSinceLastPromotion 1 481788 871746018 12073
## + TrainingTimesLastYear 1 321058 871906748 12073
## + YearsWithCurrManager 1 245844 871981962 12074
## + YearsInCurrentRole 1 170989 872056818 12074
## + HourlyRate 1 76665 872151141 12074
## + Attrition 1 57292 872170514 12074
## + Age 1 49458 872178348 12074
## + YearsAtCompany 1 39457 872188349 12074
## + OverTime 1 38699 872189107 12074
## + PercentSalaryHike 1 21429 872206377 12074
## + MaritalStatus 2 772860 871454946 12075
## + Education 4 4715081 867512725 12075
## + WorkLifeBalance 3 1811417 870416389 12076
## + RelationshipSatisfaction 3 1602719 870625087 12076
## + JobSatisfaction 3 1589572 870638234 12076
## + JobInvolvement 3 1392219 870835587 12076
## + EnvironmentSatisfaction 3 976818 871250988 12077
## + StockOptionLevel 3 717756 871510050 12077
## + EducationField 5 1826862 870400944 12080
## + BusinessTravel:JobRole 16 5677317 866550489 12098
##
## Step: AIC=12071.37
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## DailyRate + Gender + ID + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel
##
## Df Sum of Sq RSS AIC
## + MonthlyRate 1 2334251 867545451 12071
## + Department 2 4273767 865605935 12071
## <none> 869879702 12071
## + EmployeeNumber 1 1507053 868372649 12072
## + PerformanceRating 1 1473002 868406700 12072
## + DistanceFromHome 1 1238820 868640882 12072
## + NumCompaniesWorked 1 915086 868964616 12072
## + YearsSinceLastPromotion 1 520480 869359222 12073
## + TrainingTimesLastYear 1 360724 869518978 12073
## + YearsWithCurrManager 1 284460 869595241 12073
## + YearsInCurrentRole 1 113060 869766642 12073
## + HourlyRate 1 69926 869809775 12073
## + Age 1 38634 869841068 12073
## + Attrition 1 26958 869852743 12073
## + OverTime 1 23823 869855878 12073
## + YearsAtCompany 1 19172 869860530 12073
## + PercentSalaryHike 1 15659 869864042 12073
## + Education 4 4936752 864942950 12074
## + MaritalStatus 2 881442 868998260 12074
## + WorkLifeBalance 3 1734191 868145511 12076
## + JobSatisfaction 3 1627702 868252000 12076
## + RelationshipSatisfaction 3 1537445 868342257 12076
## + JobInvolvement 3 1172744 868706958 12076
## + EnvironmentSatisfaction 3 1047576 868832126 12076
## + StockOptionLevel 3 682525 869197177 12077
## + EducationField 5 1482235 868397466 12080
## + BusinessTravel:JobRole 16 5649937 864229765 12098
##
## Step: AIC=12071.04
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## DailyRate + Gender + ID + MonthlyRate + JobRole:TotalWorkingYears +
## TotalWorkingYears:BusinessTravel
##
## Df Sum of Sq RSS AIC
## + Department 2 4240996 863304454 12071
## <none> 867545451 12071
## + EmployeeNumber 1 1674053 865871398 12071
## + PerformanceRating 1 1495227 866050224 12072
## + DistanceFromHome 1 1289911 866255540 12072
## + NumCompaniesWorked 1 924489 866620962 12072
## + YearsSinceLastPromotion 1 477651 867067800 12073
## + YearsWithCurrManager 1 405016 867140435 12073
## + TrainingTimesLastYear 1 340223 867205228 12073
## + YearsInCurrentRole 1 106127 867439324 12073
## + HourlyRate 1 83782 867461669 12073
## + Age 1 71994 867473457 12073
## + YearsAtCompany 1 66091 867479360 12073
## + Attrition 1 39060 867506391 12073
## + OverTime 1 25310 867520141 12073
## + PercentSalaryHike 1 14288 867531163 12073
## + MaritalStatus 2 929070 866616381 12074
## + Education 4 4777490 862767961 12074
## + WorkLifeBalance 3 1905462 865639989 12075
## + JobSatisfaction 3 1720853 865824597 12075
## + RelationshipSatisfaction 3 1393023 866152428 12076
## + JobInvolvement 3 1282954 866262497 12076
## + EnvironmentSatisfaction 3 935892 866609559 12076
## + StockOptionLevel 3 744152 866801299 12076
## + EducationField 5 1500745 866044705 12080
## + BusinessTravel:JobRole 16 5431376 862114075 12098
##
## Step: AIC=12070.77
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
## DailyRate + Gender + ID + MonthlyRate + Department + JobRole:TotalWorkingYears +
## TotalWorkingYears:BusinessTravel
##
## Df Sum of Sq RSS AIC
## <none> 863304454 12071
## + PerformanceRating 1 1519041 861785414 12071
## + EmployeeNumber 1 1442015 861862439 12071
## + DistanceFromHome 1 1152915 862151539 12072
## + NumCompaniesWorked 1 1035125 862269329 12072
## + YearsSinceLastPromotion 1 422382 862882072 12072
## + YearsWithCurrManager 1 365374 862939080 12072
## + TrainingTimesLastYear 1 308912 862995543 12072
## + TotalWorkingYears:Department 2 2251394 861053061 12072
## + YearsInCurrentRole 1 93417 863211037 12073
## + HourlyRate 1 83015 863221439 12073
## + YearsAtCompany 1 44217 863260237 12073
## + Attrition 1 35467 863268988 12073
## + Age 1 30786 863273668 12073
## + OverTime 1 30614 863273841 12073
## + PercentSalaryHike 1 15683 863288772 12073
## + MaritalStatus 2 847357 862457098 12074
## + Education 4 4622165 858682289 12074
## + JobInvolvement 3 1504123 861800332 12075
## + WorkLifeBalance 3 1484778 861819677 12075
## + RelationshipSatisfaction 3 1420059 861884395 12075
## + JobSatisfaction 3 1386061 861918393 12075
## + EnvironmentSatisfaction 3 1074435 862230020 12076
## + StockOptionLevel 3 705555 862598899 12076
## + BusinessTravel:Department 4 1334798 861969656 12077
## + EducationField 5 1318335 861986120 12079
## + BusinessTravel:JobRole 16 5384523 857919931 12097
coef(model.forward)
## (Intercept)
## -1.265299e+03
## JobLevel
## 2.767041e+03
## JobRoleHuman Resources
## 6.480376e+02
## JobRoleLaboratory Technician
## 4.960850e+02
## JobRoleManager
## 4.444156e+03
## JobRoleManufacturing Director
## -2.079844e+02
## JobRoleResearch Director
## 4.607222e+03
## JobRoleResearch Scientist
## 4.109792e+02
## JobRoleSales Executive
## 4.254338e+02
## JobRoleSales Representative
## 6.843187e+02
## TotalWorkingYears
## 9.240775e+01
## BusinessTravelTravel_Frequently
## 5.983496e+02
## BusinessTravelTravel_Rarely
## 5.200031e+02
## DailyRate
## 1.859171e-01
## GenderMale
## 1.270529e+02
## ID
## -2.201902e-01
## MonthlyRate
## -7.350286e-03
## DepartmentResearch & Development
## 3.078081e+02
## DepartmentSales
## -3.360249e+02
## JobRoleHuman Resources:TotalWorkingYears
## -7.836099e+01
## JobRoleLaboratory Technician:TotalWorkingYears
## -1.211658e+02
## JobRoleManager:TotalWorkingYears
## -1.864787e+01
## JobRoleManufacturing Director:TotalWorkingYears
## 2.945552e+01
## JobRoleResearch Director:TotalWorkingYears
## -3.820688e+01
## JobRoleResearch Scientist:TotalWorkingYears
## -7.756305e+01
## JobRoleSales Executive:TotalWorkingYears
## 2.001124e+01
## JobRoleSales Representative:TotalWorkingYears
## -4.755616e+01
## TotalWorkingYears:BusinessTravelTravel_Frequently
## -3.259974e+01
## TotalWorkingYears:BusinessTravelTravel_Rarely
## -9.623839e+00
#fit the model
lm.forward <- lm(MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel +
DailyRate + Gender + ID + MonthlyRate + Department + JobRole:TotalWorkingYears +
TotalWorkingYears:BusinessTravel, data=train)
#check residuals and summary
summary(lm.forward)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears +
## BusinessTravel + DailyRate + Gender + ID + MonthlyRate +
## Department + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2529.2 -642.1 -104.2 643.6 3796.5
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.292e+03 7.314e+02 -1.767
## JobLevel 2.818e+03 1.168e+02 24.127
## JobRoleHuman Resources 9.826e+02 9.589e+02 1.025
## JobRoleLaboratory Technician 6.847e+02 4.471e+02 1.532
## JobRoleManager 4.336e+03 8.803e+02 4.926
## JobRoleManufacturing Director -4.662e+01 5.100e+02 -0.091
## JobRoleResearch Director 4.427e+03 7.185e+02 6.160
## JobRoleResearch Scientist 4.212e+02 4.416e+02 0.954
## JobRoleSales Executive 3.824e+02 6.452e+02 0.593
## JobRoleSales Representative 8.553e+02 6.550e+02 1.306
## TotalWorkingYears 1.012e+02 3.115e+01 3.250
## BusinessTravelTravel_Frequently 3.524e+02 2.894e+02 1.218
## BusinessTravelTravel_Rarely 5.987e+02 2.453e+02 2.441
## DailyRate 5.167e-02 1.258e-01 0.411
## GenderMale 1.615e+02 1.001e+02 1.613
## ID -2.397e-01 1.948e-01 -1.231
## MonthlyRate -5.769e-03 7.053e-03 -0.818
## DepartmentResearch & Development 2.370e+02 5.182e+02 0.457
## DepartmentSales -4.829e+02 5.137e+02 -0.940
## JobRoleHuman Resources:TotalWorkingYears -1.784e+02 1.073e+02 -1.662
## JobRoleLaboratory Technician:TotalWorkingYears -1.443e+02 3.551e+01 -4.065
## JobRoleManager:TotalWorkingYears -3.317e+01 4.084e+01 -0.812
## JobRoleManufacturing Director:TotalWorkingYears 8.148e+00 3.531e+01 0.231
## JobRoleResearch Director:TotalWorkingYears -4.738e+01 3.857e+01 -1.228
## JobRoleResearch Scientist:TotalWorkingYears -6.978e+01 3.462e+01 -2.016
## JobRoleSales Executive:TotalWorkingYears 2.232e+01 3.149e+01 0.709
## JobRoleSales Representative:TotalWorkingYears -3.131e+01 5.165e+01 -0.606
## TotalWorkingYears:BusinessTravelTravel_Frequently -1.468e+01 2.279e+01 -0.644
## TotalWorkingYears:BusinessTravelTravel_Rarely -9.883e+00 1.950e+01 -0.507
## Pr(>|t|)
## (Intercept) 0.07806 .
## JobLevel < 2e-16 ***
## JobRoleHuman Resources 0.30612
## JobRoleLaboratory Technician 0.12639
## JobRoleManager 1.23e-06 ***
## JobRoleManufacturing Director 0.92721
## JobRoleResearch Director 1.75e-09 ***
## JobRoleResearch Scientist 0.34073
## JobRoleSales Executive 0.55367
## JobRoleSales Representative 0.19236
## TotalWorkingYears 0.00125 **
## BusinessTravelTravel_Frequently 0.22398
## BusinessTravelTravel_Rarely 0.01509 *
## DailyRate 0.68152
## GenderMale 0.10748
## ID 0.21912
## MonthlyRate 0.41390
## DepartmentResearch & Development 0.64771
## DepartmentSales 0.34777
## JobRoleHuman Resources:TotalWorkingYears 0.09720 .
## JobRoleLaboratory Technician:TotalWorkingYears 5.76e-05 ***
## JobRoleManager:TotalWorkingYears 0.41724
## JobRoleManufacturing Director:TotalWorkingYears 0.81765
## JobRoleResearch Director:TotalWorkingYears 0.22002
## JobRoleResearch Scientist:TotalWorkingYears 0.04450 *
## JobRoleSales Executive:TotalWorkingYears 0.47879
## JobRoleSales Representative:TotalWorkingYears 0.54473
## TotalWorkingYears:BusinessTravelTravel_Frequently 0.51971
## TotalWorkingYears:BusinessTravelTravel_Rarely 0.61250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1002 on 405 degrees of freedom
## Multiple R-squared: 0.9562, Adjusted R-squared: 0.9532
## F-statistic: 316 on 28 and 405 DF, p-value: < 2.2e-16
plot(lm.forward)
#prediction error
preds.lm.forward <- predict(lm.forward, test)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.forward, test$MonthlyIncome)
## [1] 1046.91
The final picked linear regression model has the following coefficients. The coefficients can be interpreted as (here are a few examples; the rest would follow the same format):
#show the coefficients of the favorite linear regression model
lm.forward
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears +
## BusinessTravel + DailyRate + Gender + ID + MonthlyRate +
## Department + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel,
## data = train)
##
## Coefficients:
## (Intercept)
## -1.292e+03
## JobLevel
## 2.818e+03
## JobRoleHuman Resources
## 9.826e+02
## JobRoleLaboratory Technician
## 6.847e+02
## JobRoleManager
## 4.336e+03
## JobRoleManufacturing Director
## -4.662e+01
## JobRoleResearch Director
## 4.427e+03
## JobRoleResearch Scientist
## 4.212e+02
## JobRoleSales Executive
## 3.824e+02
## JobRoleSales Representative
## 8.553e+02
## TotalWorkingYears
## 1.012e+02
## BusinessTravelTravel_Frequently
## 3.524e+02
## BusinessTravelTravel_Rarely
## 5.987e+02
## DailyRate
## 5.167e-02
## GenderMale
## 1.615e+02
## ID
## -2.397e-01
## MonthlyRate
## -5.769e-03
## DepartmentResearch & Development
## 2.370e+02
## DepartmentSales
## -4.829e+02
## JobRoleHuman Resources:TotalWorkingYears
## -1.784e+02
## JobRoleLaboratory Technician:TotalWorkingYears
## -1.443e+02
## JobRoleManager:TotalWorkingYears
## -3.317e+01
## JobRoleManufacturing Director:TotalWorkingYears
## 8.148e+00
## JobRoleResearch Director:TotalWorkingYears
## -4.738e+01
## JobRoleResearch Scientist:TotalWorkingYears
## -6.978e+01
## JobRoleSales Executive:TotalWorkingYears
## 2.232e+01
## JobRoleSales Representative:TotalWorkingYears
## -3.131e+01
## TotalWorkingYears:BusinessTravelTravel_Frequently
## -1.468e+01
## TotalWorkingYears:BusinessTravelTravel_Rarely
## -9.883e+00
Since KNN is about looking like your nearest neightbors, I will do some additional exploratory plots to see what factors make similar neighbors with each other in regards to income. It looks like Total Working Years split up by Job Level give some of the best separation between income levels. The Principal Components also give some amount of distinction but are still quite muddled.
#Plots of working years vs income, colored by various elements to see if they make a good split
employees %>%
mutate(PromotionBins = cut(employees$YearsSinceLastPromotion, 10)) %>%
ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=PromotionBins)) +
geom_point()
employees %>%
mutate(TenureBins = cut(employees$YearsAtCompany, 10)) %>%
ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=TenureBins)) +
geom_point()
employees %>%
mutate(ManagerBins = cut(employees$YearsWithCurrManager, 10)) %>%
ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=ManagerBins)) +
geom_point()
employees %>%
mutate(RoleBins = cut(employees$YearsInCurrentRole, 10)) %>%
ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=RoleBins)) +
geom_point()
employees %>%
ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=factor(JobLevel))) +
geom_point()
employees %>%
mutate(IncomeBins = cut(employees$MonthlyIncome, 10)) %>%
ggplot(aes(x=TotalWorkingYears, JobLevel, colour=IncomeBins)) +
geom_point()
employees_yrs <- employees %>%
select(c(TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager))
employees_pcs <- prcomp(employees_yrs, scale = TRUE)
employees %>%
ggplot(aes(x=employees_pcs$x[,1], y=MonthlyIncome)) +
geom_point()
employees %>%
mutate(IncomeBins = cut(employees$MonthlyIncome, 10)) %>%
mutate(PC1 = employees_pcs$x[,1]) %>%
mutate(PC2 = employees_pcs$x[,2]) %>%
ggplot(aes(x=PC1, y=PC2, colour=IncomeBins)) +
geom_point()
I run KNN a few different times–one with everything, one with a few manually selected models, one with just JobLevel and TotalWorkingYears, and one with principal components. The one with just JobLevel and TotalWorkingYears is the best here, but all of these KNN models are a long shot from the linear regression models!
Everything RMSE: 7701 Manual RMSE: 7677 JobLevel TotalWorkingYears RMSE: 7675 PCs RMSE: 7685
employees_z2 <- employees_z %>%
mutate(MonthlyIncome = employees_num$MonthlyIncome) %>%
mutate(zAttrition = scale(as.numeric(Attrition)))
employees_z2$Attrition <- NULL
employees_z2$zMonthlyIncome <- NULL
employees_z2$PC1 <- employees_pcs$x[,1]
employees_z2$PC2 <- employees_pcs$x[,2]
employees_z2$PC3 <- employees_pcs$x[,3]
#train test split
folds <- createFolds(employees_z2$MonthlyIncome, k=2)
train <- employees_z2[folds$Fold1,]
test <- employees_z2[folds$Fold2,]
##run knn with everything
preds.knn <- knn(train[, names(train) != "MonthlyIncome"], test[, names(train) != "MonthlyIncome"], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)
#gross
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7664.69
##run knn with just a few manually selected variables
preds.knn <- knn(train[, c("zJobRole", "zJobLevel", "zTotalWorkingYears")], test[, c("zJobRole", "zJobLevel", "zTotalWorkingYears")], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7630.55
#not bad
##run knn with only Job Level and TotalWorkingYears
preds.knn <- knn(train[, c("zJobLevel", "zTotalWorkingYears")], test[, c("zJobLevel", "zTotalWorkingYears")], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7627.026
#not bad
##run knn with PCs
preds.knn <- knn(train[, c("zJobRole", "zJobLevel", "PC1", "PC2", "PC3")], test[, c("zJobRole", "zJobLevel", "PC1", "PC2", "PC3")], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7638.09
Linear Regression with interaction terms wins as a predictive model of income, with the lowest RMSE of 999. The model formula is:
MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + DailyRate + Gender + ID + MonthlyRate + Department + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel
Overall, the linear regression fit much better than the KNN algorithm for predicting income.
We went over analysis of job role trends, made a model for predicting attrition, and made a model for predicting income.
Highlights of our job role trends analysis:
Highlights of our attrition model:
Highlights of our income model: